library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.4
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ---------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
DataCamp offer interactive courses related to R Programming. While some is review, it is helpful to see other perspectives on material. As well, DataCamp has some interesting materials on packages that I want to learn better (ggplot2, dplyr, ggvis, etc.). This document summarizes a few key insights from:
This document is currently split between _v003 and _v003_a and _v003_b and _v003_c and _v003_d due to the need to keep the number of DLL that it opens below the hard-coded maximum. This introductory section needs to be re-written, and the contents consolidated, at a future date.
The original DataCamp_Insights_v001 and DataCamp_Insights_v002 documents have been split for this document:
Chapter 1 - Transforming Data with dplyr
Counties Dataset:
Filter and Arrange Verbs:
Mutate:
Example code includes:
counties <- readRDS("./RInputFiles/counties.rds")
babynames <- readRDS("./RInputFiles/babynames.rds")
str(counties)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 3138 obs. of 40 variables:
## $ census_id : chr "1001" "1003" "1005" "1007" ...
## $ state : chr "Alabama" "Alabama" "Alabama" "Alabama" ...
## $ county : chr "Autauga" "Baldwin" "Barbour" "Bibb" ...
## $ region : chr "South" "South" "South" "South" ...
## $ metro : chr "Metro" "Metro" "Nonmetro" "Metro" ...
## $ population : num 55221 195121 26932 22604 57710 ...
## $ men : num 26745 95314 14497 12073 28512 ...
## $ women : num 28476 99807 12435 10531 29198 ...
## $ hispanic : num 2.6 4.5 4.6 2.2 8.6 4.4 1.2 3.5 0.4 1.5 ...
## $ white : num 75.8 83.1 46.2 74.5 87.9 22.2 53.3 73 57.3 91.7 ...
## $ black : num 18.5 9.5 46.7 21.4 1.5 70.7 43.8 20.3 40.3 4.8 ...
## $ native : num 0.4 0.6 0.2 0.4 0.3 1.2 0.1 0.2 0.2 0.6 ...
## $ asian : num 1 0.7 0.4 0.1 0.1 0.2 0.4 0.9 0.8 0.3 ...
## $ pacific : num 0 0 0 0 0 0 0 0 0 0 ...
## $ citizens : num 40725 147695 20714 17495 42345 ...
## $ income : num 51281 50254 32964 38678 45813 ...
## $ income_err : num 2391 1263 2973 3995 3141 ...
## $ income_per_cap : num 24974 27317 16824 18431 20532 ...
## $ income_per_cap_err: num 1080 711 798 1618 708 ...
## $ poverty : num 12.9 13.4 26.7 16.8 16.7 24.6 25.4 20.5 21.6 19.2 ...
## $ child_poverty : num 18.6 19.2 45.3 27.9 27.2 38.4 39.2 31.6 37.2 30.1 ...
## $ professional : num 33.2 33.1 26.8 21.5 28.5 18.8 27.5 27.3 23.3 29.3 ...
## $ service : num 17 17.7 16.1 17.9 14.1 15 16.6 17.7 14.5 16 ...
## $ office : num 24.2 27.1 23.1 17.8 23.9 19.7 21.9 24.2 26.3 19.5 ...
## $ construction : num 8.6 10.8 10.8 19 13.5 20.1 10.3 10.5 11.5 13.7 ...
## $ production : num 17.1 11.2 23.1 23.7 19.9 26.4 23.7 20.4 24.4 21.5 ...
## $ drive : num 87.5 84.7 83.8 83.2 84.9 74.9 84.5 85.3 85.1 83.9 ...
## $ carpool : num 8.8 8.8 10.9 13.5 11.2 14.9 12.4 9.4 11.9 12.1 ...
## $ transit : num 0.1 0.1 0.4 0.5 0.4 0.7 0 0.2 0.2 0.2 ...
## $ walk : num 0.5 1 1.8 0.6 0.9 5 0.8 1.2 0.3 0.6 ...
## $ other_transp : num 1.3 1.4 1.5 1.5 0.4 1.7 0.6 1.2 0.4 0.7 ...
## $ work_at_home : num 1.8 3.9 1.6 0.7 2.3 2.8 1.7 2.7 2.1 2.5 ...
## $ mean_commute : num 26.5 26.4 24.1 28.8 34.9 27.5 24.6 24.1 25.1 27.4 ...
## $ employed : num 23986 85953 8597 8294 22189 ...
## $ private_work : num 73.6 81.5 71.8 76.8 82 79.5 77.4 74.1 85.1 73.1 ...
## $ public_work : num 20.9 12.3 20.8 16.1 13.5 15.1 16.2 20.8 12.1 18.5 ...
## $ self_employed : num 5.5 5.8 7.3 6.7 4.2 5.4 6.2 5 2.8 7.9 ...
## $ family_work : num 0 0.4 0.1 0.4 0.4 0 0.2 0.1 0 0.5 ...
## $ unemployment : num 7.6 7.5 17.6 8.3 7.7 18 10.9 12.3 8.9 7.9 ...
## $ land_area : num 594 1590 885 623 645 ...
str(babynames)
## Classes 'tbl_df', 'tbl' and 'data.frame': 332595 obs. of 3 variables:
## $ year : num 1880 1880 1880 1880 1880 1880 1880 1880 1880 1880 ...
## $ name : chr "Aaron" "Ab" "Abbie" "Abbott" ...
## $ number: int 102 5 71 5 6 50 9 12 27 81 ...
# Select the columns
counties %>%
select(state, county, population, poverty)
## # A tibble: 3,138 x 4
## state county population poverty
## <chr> <chr> <dbl> <dbl>
## 1 Alabama Autauga 55221 12.9
## 2 Alabama Baldwin 195121 13.4
## 3 Alabama Barbour 26932 26.7
## 4 Alabama Bibb 22604 16.8
## 5 Alabama Blount 57710 16.7
## 6 Alabama Bullock 10678 24.6
## 7 Alabama Butler 20354 25.4
## 8 Alabama Calhoun 116648 20.5
## 9 Alabama Chambers 34079 21.6
## 10 Alabama Cherokee 26008 19.2
## # ... with 3,128 more rows
counties_selected <- counties %>%
select(state, county, population, private_work, public_work, self_employed)
# Add a verb to sort in descending order of public_work
counties_selected %>%
arrange(desc(public_work))
## # A tibble: 3,138 x 6
## state county population private_work public_work self_employed
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Hawaii Kalawao 85 25 64.1 10.9
## 2 Alaska Yukon-Koyukuk Ce~ 5644 33.3 61.7 5.1
## 3 Wisconsin Menominee 4451 36.8 59.1 3.7
## 4 North Da~ Sioux 4380 32.9 56.8 10.2
## 5 South Da~ Todd 9942 34.4 55 9.8
## 6 Alaska Lake and Peninsu~ 1474 42.2 51.6 6.1
## 7 Californ~ Lassen 32645 42.6 50.5 6.8
## 8 South Da~ Buffalo 2038 48.4 49.5 1.8
## 9 South Da~ Dewey 5579 34.9 49.2 14.7
## 10 Texas Kenedy 565 51.9 48.1 0
## # ... with 3,128 more rows
counties_selected <- counties %>%
select(state, county, population)
# Filter for counties in the state of California that have a population above 1000000
counties_selected %>%
filter(state=="California", population > 1000000)
## # A tibble: 9 x 3
## state county population
## <chr> <chr> <dbl>
## 1 California Alameda 1584983
## 2 California Contra Costa 1096068
## 3 California Los Angeles 10038388
## 4 California Orange 3116069
## 5 California Riverside 2298032
## 6 California Sacramento 1465832
## 7 California San Bernardino 2094769
## 8 California San Diego 3223096
## 9 California Santa Clara 1868149
counties_selected <- counties %>%
select(state, county, population, private_work, public_work, self_employed)
# Filter for Texas and more than 10000 people; sort in descending order of private_work
counties_selected %>%
filter(state=="Texas", population > 10000) %>%
arrange(desc(private_work))
## # A tibble: 169 x 6
## state county population private_work public_work self_employed
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Texas Gregg 123178 84.7 9.8 5.4
## 2 Texas Collin 862215 84.1 10 5.8
## 3 Texas Dallas 2485003 83.9 9.5 6.4
## 4 Texas Harris 4356362 83.4 10.1 6.3
## 5 Texas Andrews 16775 83.1 9.6 6.8
## 6 Texas Tarrant 1914526 83.1 11.4 5.4
## 7 Texas Titus 32553 82.5 10 7.4
## 8 Texas Denton 731851 82.2 11.9 5.7
## 9 Texas Ector 149557 82 11.2 6.7
## 10 Texas Moore 22281 82 11.7 5.9
## # ... with 159 more rows
counties_selected <- counties %>%
select(state, county, population, public_work)
# Sort in descending order of the public_workers column
counties_selected %>%
mutate(public_workers = public_work * population / 100) %>%
arrange(desc(public_workers))
## # A tibble: 3,138 x 5
## state county population public_work public_workers
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 California Los Angeles 10038388 11.5 1154415.
## 2 Illinois Cook 5236393 11.5 602185.
## 3 California San Diego 3223096 14.8 477018.
## 4 Arizona Maricopa 4018143 11.7 470123.
## 5 Texas Harris 4356362 10.1 439993.
## 6 New York Kings 2595259 14.4 373717.
## 7 California San Bernardino 2094769 16.7 349826.
## 8 California Riverside 2298032 14.9 342407.
## 9 California Sacramento 1465832 21.8 319551.
## 10 California Orange 3116069 10.2 317839.
## # ... with 3,128 more rows
# Select the columns state, county, population, men, and women
counties_selected <- counties %>%
select(state, county, population, men, women)
# Calculate proportion_women as the fraction of the population made up of women
counties_selected %>%
mutate(proportion_women = women / population)
## # A tibble: 3,138 x 6
## state county population men women proportion_women
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Alabama Autauga 55221 26745 28476 0.516
## 2 Alabama Baldwin 195121 95314 99807 0.512
## 3 Alabama Barbour 26932 14497 12435 0.462
## 4 Alabama Bibb 22604 12073 10531 0.466
## 5 Alabama Blount 57710 28512 29198 0.506
## 6 Alabama Bullock 10678 5660 5018 0.470
## 7 Alabama Butler 20354 9502 10852 0.533
## 8 Alabama Calhoun 116648 56274 60374 0.518
## 9 Alabama Chambers 34079 16258 17821 0.523
## 10 Alabama Cherokee 26008 12975 13033 0.501
## # ... with 3,128 more rows
counties %>%
# Select the five columns
select(state, county, population, men, women) %>%
# Add the proportion_men variable
mutate(proportion_men = men/population) %>%
# Filter for population of at least 10,000
filter(population >= 10000) %>%
# Arrange proportion of men in descending order
arrange(desc(proportion_men))
## # A tibble: 2,437 x 6
## state county population men women proportion_men
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Virginia Sussex 11864 8130 3734 0.685
## 2 California Lassen 32645 21818 10827 0.668
## 3 Georgia Chattahoochee 11914 7940 3974 0.666
## 4 Louisiana West Feliciana 15415 10228 5187 0.664
## 5 Florida Union 15191 9830 5361 0.647
## 6 Texas Jones 19978 12652 7326 0.633
## 7 Missouri DeKalb 12782 8080 4702 0.632
## 8 Texas Madison 13838 8648 5190 0.625
## 9 Virginia Greensville 11760 7303 4457 0.621
## 10 Texas Anderson 57915 35469 22446 0.612
## # ... with 2,427 more rows
Chapter 2 - Aggregating Data
Count Verb:
Group By, Summarize, and Ungroup:
The top_n verb:
Example code includes:
# Use count to find the number of counties in each region
counties %>%
count(region, sort=TRUE)
## # A tibble: 4 x 2
## region n
## <chr> <int>
## 1 South 1420
## 2 North Central 1054
## 3 West 447
## 4 Northeast 217
# Find number of counties per state, weighted by citizens
counties %>%
count(state, wt=citizens, sort=TRUE)
## # A tibble: 50 x 2
## state n
## <chr> <dbl>
## 1 California 24280349
## 2 Texas 16864864
## 3 Florida 13933052
## 4 New York 13531404
## 5 Pennsylvania 9710416
## 6 Illinois 8979999
## 7 Ohio 8709050
## 8 Michigan 7380136
## 9 North Carolina 7107998
## 10 Georgia 6978660
## # ... with 40 more rows
counties %>%
# Add population_walk containing the total number of people who walk to work
mutate(population_walk = walk * population / 100) %>%
# Count weighted by the new column
count(state, wt=population_walk, sort=TRUE)
## # A tibble: 50 x 2
## state n
## <chr> <dbl>
## 1 New York 1237938.
## 2 California 1017964.
## 3 Pennsylvania 505397.
## 4 Texas 430783.
## 5 Illinois 400346.
## 6 Massachusetts 316765.
## 7 Florida 284723.
## 8 New Jersey 273047.
## 9 Ohio 266911.
## 10 Washington 239764.
## # ... with 40 more rows
# Summarize to find minimum population, maximum unexployment, and average income
counties %>%
summarize(min_population=min(population),
max_unemployment=max(unemployment),
average_income=mean(income)
)
## # A tibble: 1 x 3
## min_population max_unemployment average_income
## <dbl> <dbl> <dbl>
## 1 85 29.4 46832.
# Add a density column, then sort in descending order
counties %>%
group_by(state) %>%
summarize(total_area = sum(land_area), total_population = sum(population)) %>%
mutate(density = total_population / total_area) %>%
arrange(desc(density))
## # A tibble: 50 x 4
## state total_area total_population density
## <chr> <dbl> <dbl> <dbl>
## 1 New Jersey 7354. 8904413 1211.
## 2 Rhode Island 1034. 1053661 1019.
## 3 Massachusetts 7800. 6705586 860.
## 4 Connecticut 4842. 3593222 742.
## 5 Maryland 9707. 5930538 611.
## 6 Delaware 1949. 926454 475.
## 7 New York 47126. 19673174 417.
## 8 Florida 53625. 19645772 366.
## 9 Pennsylvania 44743. 12779559 286.
## 10 Ohio 40861. 11575977 283.
## # ... with 40 more rows
# Calculate the average_pop and median_pop columns
counties %>%
group_by(region, state) %>%
summarize(total_pop = sum(population)) %>%
summarize(average_pop = mean(total_pop), median_pop=median(total_pop))
## # A tibble: 4 x 3
## region average_pop median_pop
## <chr> <dbl> <dbl>
## 1 North Central 5627687. 5580644
## 2 Northeast 6221058. 3593222
## 3 South 7370486 4804098
## 4 West 5722755. 2798636
# Group by region and find the greatest number of citizens who walk to work
counties %>%
group_by(region) %>%
top_n(1, walk) %>%
select(state, county, region, metro, population, walk, citizens)
## # A tibble: 4 x 7
## # Groups: region [4]
## state county region metro population walk citizens
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Alaska Aleutians East Boro~ West Nonmet~ 3304 71.2 1874
## 2 New York New York Northeast Metro 1629507 20.7 1156936
## 3 North Dako~ McIntosh North Cent~ Nonmet~ 2759 17.5 2239
## 4 Virginia Lexington city South Nonmet~ 7071 31.7 6261
counties %>%
group_by(region, state) %>%
# Calculate average income
summarize(average_income=mean(income)) %>%
# Find the highest income state in each region
top_n(1, average_income)
## # A tibble: 4 x 3
## # Groups: region [4]
## region state average_income
## <chr> <chr> <dbl>
## 1 North Central North Dakota 55575.
## 2 Northeast New Jersey 73014.
## 3 South Maryland 69200.
## 4 West Alaska 65125.
# Count the states with more people in Metro or Nonmetro areas
counties %>%
group_by(state, metro) %>%
summarize(total_pop = sum(population)) %>%
top_n(1, total_pop) %>%
ungroup() %>%
count(metro)
## # A tibble: 2 x 2
## metro n
## <chr> <int>
## 1 Metro 44
## 2 Nonmetro 6
Chapter 3 - Selecting and Transforming Data
Selecting:
Renaming:
Transmuting:
Example code includes:
# Glimpse the counties table
glimpse(counties)
## Observations: 3,138
## Variables: 40
## $ census_id <chr> "1001", "1003", "1005", "1007", "1009", "1011", ...
## $ state <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Ala...
## $ county <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount...
## $ region <chr> "South", "South", "South", "South", "South", "So...
## $ metro <chr> "Metro", "Metro", "Nonmetro", "Metro", "Metro", ...
## $ population <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354...
## $ men <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 5...
## $ women <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, ...
## $ hispanic <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5...
## $ white <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, ...
## $ black <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40...
## $ native <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6...
## $ asian <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3...
## $ pacific <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...
## $ citizens <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581,...
## $ income <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229,...
## $ income_err <dbl> 2391, 1263, 2973, 3995, 3141, 5884, 1793, 925, 2...
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390,...
## $ income_per_cap_err <dbl> 1080, 711, 798, 1618, 708, 2055, 714, 489, 1366,...
## $ poverty <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, ...
## $ child_poverty <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, ...
## $ professional <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, ...
## $ service <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, ...
## $ office <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, ...
## $ construction <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 1...
## $ production <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, ...
## $ drive <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, ...
## $ carpool <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11....
## $ transit <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2...
## $ walk <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6...
## $ other_transp <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7...
## $ work_at_home <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5...
## $ mean_commute <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, ...
## $ employed <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 474...
## $ private_work <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, ...
## $ public_work <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, ...
## $ self_employed <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9...
## $ family_work <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5...
## $ unemployment <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9,...
## $ land_area <dbl> 594.44, 1589.78, 884.88, 622.58, 644.78, 622.81,...
counties %>%
# Select state, county, population, and industry-related columns
select(state, county, population, professional:production) %>%
# Arrange service in descending order
arrange(desc(service))
## # A tibble: 3,138 x 8
## state county population professional service office construction production
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Missis~ Tunica 10477 23.9 36.6 21.5 3.5 14.5
## 2 Texas Kinney 3577 30 36.5 11.6 20.5 1.3
## 3 Texas Kenedy 565 24.9 34.1 20.5 20.5 0
## 4 New Yo~ Bronx 1428357 24.3 33.3 24.2 7.1 11
## 5 Texas Brooks 7221 19.6 32.4 25.3 11.1 11.5
## 6 Colora~ Fremo~ 46809 26.6 32.2 22.8 10.7 7.6
## 7 Texas Culbe~ 2296 20.1 32.2 24.2 15.7 7.8
## 8 Califo~ Del N~ 27788 33.9 31.5 18.8 8.9 6.8
## 9 Minnes~ Mahno~ 5496 26.8 31.5 18.7 13.1 9.9
## 10 Virgin~ Lanca~ 11129 30.3 31.2 22.8 8.1 7.6
## # ... with 3,128 more rows
counties %>%
# Select the state, county, population, and those ending with "work"
select(state, county, population, ends_with("work")) %>%
# Filter for counties that have at least 50% of people engaged in public work
filter(public_work >= 50)
## # A tibble: 7 x 6
## state county population private_work public_work family_work
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Alaska Lake and Peninsula~ 1474 42.2 51.6 0.2
## 2 Alaska Yukon-Koyukuk Cens~ 5644 33.3 61.7 0
## 3 California Lassen 32645 42.6 50.5 0.1
## 4 Hawaii Kalawao 85 25 64.1 0
## 5 North Dak~ Sioux 4380 32.9 56.8 0.1
## 6 South Dak~ Todd 9942 34.4 55 0.8
## 7 Wisconsin Menominee 4451 36.8 59.1 0.4
# Rename the n column to num_counties
counties %>%
count(state) %>%
rename(num_counties=n)
## # A tibble: 50 x 2
## state num_counties
## <chr> <int>
## 1 Alabama 67
## 2 Alaska 28
## 3 Arizona 15
## 4 Arkansas 75
## 5 California 58
## 6 Colorado 64
## 7 Connecticut 8
## 8 Delaware 3
## 9 Florida 67
## 10 Georgia 159
## # ... with 40 more rows
# Select state, county, and poverty as poverty_rate
counties %>%
select(state, county, poverty_rate = poverty)
## # A tibble: 3,138 x 3
## state county poverty_rate
## <chr> <chr> <dbl>
## 1 Alabama Autauga 12.9
## 2 Alabama Baldwin 13.4
## 3 Alabama Barbour 26.7
## 4 Alabama Bibb 16.8
## 5 Alabama Blount 16.7
## 6 Alabama Bullock 24.6
## 7 Alabama Butler 25.4
## 8 Alabama Calhoun 20.5
## 9 Alabama Chambers 21.6
## 10 Alabama Cherokee 19.2
## # ... with 3,128 more rows
counties %>%
# Keep the state, county, and populations columns, and add a density column
transmute(state, county, population, density = population / land_area) %>%
# Filter for counties with a population greater than one million
filter(population > 1000000) %>%
# Sort density in ascending order
arrange(density)
## # A tibble: 41 x 4
## state county population density
## <chr> <chr> <dbl> <dbl>
## 1 California San Bernardino 2094769 104.
## 2 Nevada Clark 2035572 258.
## 3 California Riverside 2298032 319.
## 4 Arizona Maricopa 4018143 437.
## 5 Florida Palm Beach 1378806 700.
## 6 California San Diego 3223096 766.
## 7 Washington King 2045756 967.
## 8 Texas Travis 1121645 1133.
## 9 Florida Hillsborough 1302884 1277.
## 10 Florida Orange 1229039 1360.
## # ... with 31 more rows
# Change the name of the unemployment column
counties %>%
rename(unemployment_rate = unemployment)
## # A tibble: 3,138 x 40
## census_id state county region metro population men women hispanic white
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1001 Alab~ Autau~ South Metro 55221 26745 28476 2.6 75.8
## 2 1003 Alab~ Baldw~ South Metro 195121 95314 99807 4.5 83.1
## 3 1005 Alab~ Barbo~ South Nonm~ 26932 14497 12435 4.6 46.2
## 4 1007 Alab~ Bibb South Metro 22604 12073 10531 2.2 74.5
## 5 1009 Alab~ Blount South Metro 57710 28512 29198 8.6 87.9
## 6 1011 Alab~ Bullo~ South Nonm~ 10678 5660 5018 4.4 22.2
## 7 1013 Alab~ Butler South Nonm~ 20354 9502 10852 1.2 53.3
## 8 1015 Alab~ Calho~ South Metro 116648 56274 60374 3.5 73
## 9 1017 Alab~ Chamb~ South Nonm~ 34079 16258 17821 0.4 57.3
## 10 1019 Alab~ Chero~ South Nonm~ 26008 12975 13033 1.5 91.7
## # ... with 3,128 more rows, and 30 more variables: black <dbl>, native <dbl>,
## # asian <dbl>, pacific <dbl>, citizens <dbl>, income <dbl>, income_err <dbl>,
## # income_per_cap <dbl>, income_per_cap_err <dbl>, poverty <dbl>,
## # child_poverty <dbl>, professional <dbl>, service <dbl>, office <dbl>,
## # construction <dbl>, production <dbl>, drive <dbl>, carpool <dbl>,
## # transit <dbl>, walk <dbl>, other_transp <dbl>, work_at_home <dbl>,
## # mean_commute <dbl>, employed <dbl>, private_work <dbl>, public_work <dbl>,
## # self_employed <dbl>, family_work <dbl>, unemployment_rate <dbl>,
## # land_area <dbl>
# Keep the state and county columns, and the columns containing poverty
counties %>%
select(state, county, contains("poverty"))
## # A tibble: 3,138 x 4
## state county poverty child_poverty
## <chr> <chr> <dbl> <dbl>
## 1 Alabama Autauga 12.9 18.6
## 2 Alabama Baldwin 13.4 19.2
## 3 Alabama Barbour 26.7 45.3
## 4 Alabama Bibb 16.8 27.9
## 5 Alabama Blount 16.7 27.2
## 6 Alabama Bullock 24.6 38.4
## 7 Alabama Butler 25.4 39.2
## 8 Alabama Calhoun 20.5 31.6
## 9 Alabama Chambers 21.6 37.2
## 10 Alabama Cherokee 19.2 30.1
## # ... with 3,128 more rows
# Calculate the fraction_women column without dropping the other columns
counties %>%
mutate(fraction_women = women / population)
## # A tibble: 3,138 x 41
## census_id state county region metro population men women hispanic white
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1001 Alab~ Autau~ South Metro 55221 26745 28476 2.6 75.8
## 2 1003 Alab~ Baldw~ South Metro 195121 95314 99807 4.5 83.1
## 3 1005 Alab~ Barbo~ South Nonm~ 26932 14497 12435 4.6 46.2
## 4 1007 Alab~ Bibb South Metro 22604 12073 10531 2.2 74.5
## 5 1009 Alab~ Blount South Metro 57710 28512 29198 8.6 87.9
## 6 1011 Alab~ Bullo~ South Nonm~ 10678 5660 5018 4.4 22.2
## 7 1013 Alab~ Butler South Nonm~ 20354 9502 10852 1.2 53.3
## 8 1015 Alab~ Calho~ South Metro 116648 56274 60374 3.5 73
## 9 1017 Alab~ Chamb~ South Nonm~ 34079 16258 17821 0.4 57.3
## 10 1019 Alab~ Chero~ South Nonm~ 26008 12975 13033 1.5 91.7
## # ... with 3,128 more rows, and 31 more variables: black <dbl>, native <dbl>,
## # asian <dbl>, pacific <dbl>, citizens <dbl>, income <dbl>, income_err <dbl>,
## # income_per_cap <dbl>, income_per_cap_err <dbl>, poverty <dbl>,
## # child_poverty <dbl>, professional <dbl>, service <dbl>, office <dbl>,
## # construction <dbl>, production <dbl>, drive <dbl>, carpool <dbl>,
## # transit <dbl>, walk <dbl>, other_transp <dbl>, work_at_home <dbl>,
## # mean_commute <dbl>, employed <dbl>, private_work <dbl>, public_work <dbl>,
## # self_employed <dbl>, family_work <dbl>, unemployment <dbl>,
## # land_area <dbl>, fraction_women <dbl>
# Keep only the state, county, and employment_rate columns
counties %>%
transmute(state, county, employment_rate = employed / population)
## # A tibble: 3,138 x 3
## state county employment_rate
## <chr> <chr> <dbl>
## 1 Alabama Autauga 0.434
## 2 Alabama Baldwin 0.441
## 3 Alabama Barbour 0.319
## 4 Alabama Bibb 0.367
## 5 Alabama Blount 0.384
## 6 Alabama Bullock 0.362
## 7 Alabama Butler 0.384
## 8 Alabama Calhoun 0.406
## 9 Alabama Chambers 0.402
## 10 Alabama Cherokee 0.390
## # ... with 3,128 more rows
Chapter 4 - Case Study
The babynames dataset:
Grouped Mutates:
Window Functions:
Wrap Up:
Example code includes:
babynames %>%
# Filter for the year 1990
filter(year==1990) %>%
# Sort the number column in descending order
arrange(desc(number))
## # A tibble: 21,223 x 3
## year name number
## <dbl> <chr> <int>
## 1 1990 Michael 65560
## 2 1990 Christopher 52520
## 3 1990 Jessica 46615
## 4 1990 Ashley 45797
## 5 1990 Matthew 44925
## 6 1990 Joshua 43382
## 7 1990 Brittany 36650
## 8 1990 Amanda 34504
## 9 1990 Daniel 33963
## 10 1990 David 33862
## # ... with 21,213 more rows
# Find the most common name in each year
babynames %>%
group_by(year) %>%
top_n(1, number)
## # A tibble: 28 x 3
## # Groups: year [28]
## year name number
## <dbl> <chr> <int>
## 1 1880 John 9701
## 2 1885 Mary 9166
## 3 1890 Mary 12113
## 4 1895 Mary 13493
## 5 1900 Mary 16781
## 6 1905 Mary 16135
## 7 1910 Mary 22947
## 8 1915 Mary 58346
## 9 1920 Mary 71175
## 10 1925 Mary 70857
## # ... with 18 more rows
# Filter for the names Steven, Thomas, and Matthew
selected_names <- babynames %>%
filter(name %in% c("Steven", "Thomas", "Matthew"))
# Plot the names using a different color for each name
ggplot(selected_names, aes(x = year, y = number, color = name)) +
geom_line()
# Find the year each name is most common
babynames %>%
group_by(year) %>%
mutate(year_total=sum(number)) %>%
ungroup() %>%
mutate(fraction = number / year_total) %>%
group_by(name) %>%
top_n(1, fraction)
## # A tibble: 48,040 x 5
## # Groups: name [48,040]
## year name number year_total fraction
## <dbl> <chr> <int> <int> <dbl>
## 1 1880 Abbott 5 201478 0.0000248
## 2 1880 Abe 50 201478 0.000248
## 3 1880 Abner 27 201478 0.000134
## 4 1880 Adelbert 28 201478 0.000139
## 5 1880 Adella 26 201478 0.000129
## 6 1880 Adolf 6 201478 0.0000298
## 7 1880 Adolph 93 201478 0.000462
## 8 1880 Agustus 5 201478 0.0000248
## 9 1880 Albert 1493 201478 0.00741
## 10 1880 Albertina 7 201478 0.0000347
## # ... with 48,030 more rows
names_normalized <- babynames %>%
group_by(name) %>%
mutate(name_total = sum(number), name_max = max(number)) %>%
# Ungroup the table
ungroup() %>%
# Add the fraction_max column containing the number by the name maximum
mutate(fraction_max = number / name_max)
names_normalized
## # A tibble: 332,595 x 6
## year name number name_total name_max fraction_max
## <dbl> <chr> <int> <int> <int> <dbl>
## 1 1880 Aaron 102 114739 14635 0.00697
## 2 1880 Ab 5 77 31 0.161
## 3 1880 Abbie 71 4330 445 0.160
## 4 1880 Abbott 5 217 51 0.0980
## 5 1880 Abby 6 11272 1753 0.00342
## 6 1880 Abe 50 1832 271 0.185
## 7 1880 Abel 9 10565 3245 0.00277
## 8 1880 Abigail 12 72600 15762 0.000761
## 9 1880 Abner 27 1552 199 0.136
## 10 1880 Abraham 81 17882 2449 0.0331
## # ... with 332,585 more rows
# Filter for the names Steven, Thomas, and Matthew
names_filtered <- names_normalized %>%
filter(name %in% c("Steven", "Thomas", "Matthew"))
# Visualize these names over time
ggplot(names_filtered, aes(x=year, y=fraction_max, color=name)) +
geom_line()
# Find the year each name is most common
babynames_fraction <- babynames %>%
group_by(year) %>%
mutate(year_total=sum(number)) %>%
ungroup() %>%
mutate(fraction = number / year_total)
babynames_fraction
## # A tibble: 332,595 x 5
## year name number year_total fraction
## <dbl> <chr> <int> <int> <dbl>
## 1 1880 Aaron 102 201478 0.000506
## 2 1880 Ab 5 201478 0.0000248
## 3 1880 Abbie 71 201478 0.000352
## 4 1880 Abbott 5 201478 0.0000248
## 5 1880 Abby 6 201478 0.0000298
## 6 1880 Abe 50 201478 0.000248
## 7 1880 Abel 9 201478 0.0000447
## 8 1880 Abigail 12 201478 0.0000596
## 9 1880 Abner 27 201478 0.000134
## 10 1880 Abraham 81 201478 0.000402
## # ... with 332,585 more rows
babynames_fraction %>%
# Arrange the data in order of name, then year
arrange(name, year) %>%
# Group the data by name
group_by(name) %>%
# Add a ratio column that contains the ratio between each year
mutate(ratio = fraction / lag(fraction))
## # A tibble: 332,595 x 6
## # Groups: name [48,040]
## year name number year_total fraction ratio
## <dbl> <chr> <int> <int> <dbl> <dbl>
## 1 2010 Aaban 9 3672066 0.00000245 NA
## 2 2015 Aaban 15 3648781 0.00000411 1.68
## 3 1995 Aadam 6 3652750 0.00000164 NA
## 4 2000 Aadam 6 3767293 0.00000159 0.970
## 5 2005 Aadam 6 3828460 0.00000157 0.984
## 6 2010 Aadam 7 3672066 0.00000191 1.22
## 7 2015 Aadam 22 3648781 0.00000603 3.16
## 8 2010 Aadan 11 3672066 0.00000300 NA
## 9 2015 Aadan 10 3648781 0.00000274 0.915
## 10 2000 Aadarsh 5 3767293 0.00000133 NA
## # ... with 332,585 more rows
babynames_ratios_filtered <- babynames_fraction %>%
arrange(name, year) %>%
group_by(name) %>%
mutate(ratio = fraction / lag(fraction)) %>%
filter(fraction >= 0.00001)
babynames_ratios_filtered
## # A tibble: 104,344 x 6
## # Groups: name [14,807]
## year name number year_total fraction ratio
## <dbl> <chr> <int> <int> <dbl> <dbl>
## 1 2010 Aaden 450 3672066 0.000123 14.2
## 2 2015 Aaden 297 3648781 0.0000814 0.664
## 3 2015 Aadhya 265 3648781 0.0000726 14.0
## 4 2005 Aadi 51 3828460 0.0000133 NA
## 5 2010 Aadi 54 3672066 0.0000147 1.10
## 6 2015 Aadi 43 3648781 0.0000118 0.801
## 7 2010 Aaditya 37 3672066 0.0000101 1.48
## 8 2015 Aadya 159 3648781 0.0000436 4.85
## 9 2010 Aadyn 38 3672066 0.0000103 3.60
## 10 2010 Aahana 64 3672066 0.0000174 5.13
## # ... with 104,334 more rows
babynames_ratios_filtered %>%
# Extract the largest ratio from each name
top_n(1, ratio) %>%
# Sort the ratio column in descending order
arrange(desc(ratio)) %>%
# Filter for fractions greater than or equal to 0.001
filter(fraction >= 0.001)
## # A tibble: 291 x 6
## # Groups: name [291]
## year name number year_total fraction ratio
## <dbl> <chr> <int> <int> <dbl> <dbl>
## 1 1960 Tammy 14365 4152075 0.00346 70.1
## 2 2005 Nevaeh 4610 3828460 0.00120 45.8
## 3 1940 Brenda 5460 2301630 0.00237 37.5
## 4 1885 Grover 774 240822 0.00321 36.0
## 5 1945 Cheryl 8170 2652029 0.00308 24.9
## 6 1955 Lori 4980 4012691 0.00124 23.2
## 7 2010 Khloe 5411 3672066 0.00147 23.2
## 8 1950 Debra 6189 3502592 0.00177 22.6
## 9 2010 Bentley 4001 3672066 0.00109 22.4
## 10 1935 Marlene 4840 2088487 0.00232 16.8
## # ... with 281 more rows
Chapter 1 - How to Write a Function
Rationale for Using Functions:
Converting Scripts in to Functions:
Code Readability:
Example code includes:
gold_medals <- c(46, 27, 26, 19, 17, 12, 10, 9, 8, 8, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 4, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA)
names(gold_medals) <- c('USA', 'GBR', 'CHN', 'RUS', 'GER', 'JPN', 'FRA', 'KOR', 'ITA', 'AUS', 'NED', 'HUN', 'BRA', 'ESP', 'KEN', 'JAM', 'CRO', 'CUB', 'NZL', 'CAN', 'UZB', 'KAZ', 'COL', 'SUI', 'IRI', 'GRE', 'ARG', 'DEN', 'SWE', 'RSA', 'UKR', 'SRB', 'POL', 'PRK', 'BEL', 'THA', 'SVK', 'GEO', 'AZE', 'BLR', 'TUR', 'ARM', 'CZE', 'ETH', 'SLO', 'INA', 'ROU', 'BRN', 'VIE', 'TPE', 'BAH', 'IOA', 'CIV', 'FIJ', 'JOR', 'KOS', 'PUR', 'SIN', 'TJK', 'MAS', 'MEX', 'VEN', 'ALG', 'IRL', 'LTU', 'BUL', 'IND', 'MGL', 'BDI', 'GRN', 'NIG', 'PHI', 'QAT', 'NOR', 'EGY', 'TUN', 'ISR', 'AUT', 'DOM', 'EST', 'FIN', 'MAR', 'NGR', 'POR', 'TTO', 'UAE', 'IOC')
# Look at the gold medals data
gold_medals
## USA GBR CHN RUS GER JPN FRA KOR ITA AUS NED HUN BRA ESP KEN JAM CRO CUB NZL CAN
## 46 27 26 19 17 12 10 9 8 8 8 8 7 7 6 6 5 5 4 4
## UZB KAZ COL SUI IRI GRE ARG DEN SWE RSA UKR SRB POL PRK BEL THA SVK GEO AZE BLR
## 4 3 3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 1 1
## TUR ARM CZE ETH SLO INA ROU BRN VIE TPE BAH IOA CIV FIJ JOR KOS PUR SIN TJK MAS
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0
## MEX VEN ALG IRL LTU BUL IND MGL BDI GRN NIG PHI QAT NOR EGY TUN ISR AUT DOM EST
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## FIN MAR NGR POR TTO UAE IOC
## 0 0 0 0 0 0 NA
# Note the arguments to median()
args(median)
## function (x, na.rm = FALSE, ...)
## NULL
# Rewrite this function call, following best practices
median(gold_medals, na.rm=TRUE)
## [1] 1
# Note the arguments to rank()
args(rank)
## function (x, na.last = TRUE, ties.method = c("average", "first",
## "last", "random", "max", "min"))
## NULL
# Rewrite this function call, following best practices
rank(-gold_medals, na.last="keep", ties.method = "min")
## USA GBR CHN RUS GER JPN FRA KOR ITA AUS NED HUN BRA ESP KEN JAM CRO CUB NZL CAN
## 1 2 3 4 5 6 7 8 9 9 9 9 13 13 15 15 17 17 19 19
## UZB KAZ COL SUI IRI GRE ARG DEN SWE RSA UKR SRB POL PRK BEL THA SVK GEO AZE BLR
## 19 22 22 22 22 22 22 28 28 28 28 28 28 28 28 28 28 28 39 39
## TUR ARM CZE ETH SLO INA ROU BRN VIE TPE BAH IOA CIV FIJ JOR KOS PUR SIN TJK MAS
## 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 60
## MEX VEN ALG IRL LTU BUL IND MGL BDI GRN NIG PHI QAT NOR EGY TUN ISR AUT DOM EST
## 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60
## FIN MAR NGR POR TTO UAE IOC
## 60 60 60 60 60 60 NA
coin_sides <- c("head", "tail")
# Sample from coin_sides once
sample(coin_sides, 1)
## [1] "head"
# Your functions, from previous steps
toss_coin <- function() {
coin_sides <- c("head", "tail")
sample(coin_sides, 1)
}
# Call your function
toss_coin()
## [1] "head"
# Update the function to return n coin tosses
toss_coin <- function(n_flips) {
coin_sides <- c("head", "tail")
sample(coin_sides, n_flips, replace=TRUE)
}
# Generate 10 coin tosses
toss_coin(10)
## [1] "head" "head" "head" "tail" "tail" "head" "tail" "tail" "head" "head"
# Update the function so heads have probability p_head
toss_coin <- function(n_flips, p_head) {
coin_sides <- c("head", "tail")
# Define a vector of weights
weights <- c(p_head, 1-p_head)
# Modify the sampling to be weighted
sample(coin_sides, n_flips, replace = TRUE, prob=weights)
}
# Generate 10 coin tosses
toss_coin(10, p_head=0.8)
## [1] "head" "head" "head" "head" "head" "head" "head" "head" "head" "head"
snake_river_visits <- readRDS("./RInputFiles/snake_river_visits.rds")
str(snake_river_visits)
## 'data.frame': 410 obs. of 4 variables:
## $ n_visits: num 0 0 0 0 0 0 0 0 0 0 ...
## $ gender : Factor w/ 2 levels "male","female": 1 1 1 2 1 2 2 2 1 1 ...
## $ income : Factor w/ 4 levels "[$0,$25k]","($25k,$55k]",..: 4 2 4 2 4 2 4 4 4 4 ...
## $ travel : Factor w/ 3 levels "[0h,0.25h]","(0.25h,4h]",..: NA NA NA NA NA NA NA NA NA NA ...
# Run a generalized linear regression
glm(
# Model no. of visits vs. gender, income, travel
n_visits ~ gender + income + travel,
# Use the snake_river_visits dataset
data = snake_river_visits,
# Make it a Poisson regression
family = poisson
)
##
## Call: glm(formula = n_visits ~ gender + income + travel, family = poisson,
## data = snake_river_visits)
##
## Coefficients:
## (Intercept) genderfemale income($25k,$55k] income($55k,$95k]
## 4.0864 0.3740 -0.0199 -0.5807
## income($95k,$Inf) travel(0.25h,4h] travel(4h,Infh)
## -0.5782 -0.6271 -2.4230
##
## Degrees of Freedom: 345 Total (i.e. Null); 339 Residual
## (64 observations deleted due to missingness)
## Null Deviance: 18850
## Residual Deviance: 11530 AIC: 12860
# From previous step
run_poisson_regression <- function(data, formula) {
glm(formula, data, family = poisson)
}
# Re-run the Poisson regression, using your function
model <- snake_river_visits %>%
run_poisson_regression(n_visits ~ gender + income + travel)
icLevels <- c("[$0,$25k]", "($25k,$55k]", "($55k,$95k]", "($95k,$Inf)")
trLevels <- c("[0h,0.25h]", "(0.25h,4h]", "(4h,Infh)")
srGender <- c("male", "female")[c(1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2)]
srIncome <- icLevels[c(1, 1, 2, 2, 3, 3, 4, 4, 1, 1, 2, 2, 3, 3, 4, 4, 1, 1, 2, 2, 3, 3, 4, 4)]
srTravel <- trLevels[c(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3)]
snake_river_explanatory <- data.frame(gender=factor(srGender, levels=c("male", "female")),
income=factor(srIncome, levels=icLevels),
travel=factor(srTravel, levels=trLevels)
)
str(snake_river_explanatory)
## 'data.frame': 24 obs. of 3 variables:
## $ gender: Factor w/ 2 levels "male","female": 1 2 1 2 1 2 1 2 1 2 ...
## $ income: Factor w/ 4 levels "[$0,$25k]","($25k,$55k]",..: 1 1 2 2 3 3 4 4 1 1 ...
## $ travel: Factor w/ 3 levels "[0h,0.25h]","(0.25h,4h]",..: 1 1 1 1 1 1 1 1 2 2 ...
# Run this to see the predictions
snake_river_explanatory %>%
mutate(predicted_n_visits = predict(model, ., type = "response"))%>%
arrange(desc(predicted_n_visits))
## gender income travel predicted_n_visits
## 1 female [$0,$25k] [0h,0.25h] 86.518598
## 2 female ($25k,$55k] [0h,0.25h] 84.813684
## 3 male [$0,$25k] [0h,0.25h] 59.524843
## 4 male ($25k,$55k] [0h,0.25h] 58.351861
## 5 female ($95k,$Inf) [0h,0.25h] 48.526883
## 6 female ($55k,$95k] [0h,0.25h] 48.408009
## 7 female [$0,$25k] (0.25h,4h] 46.212343
## 8 female ($25k,$55k] (0.25h,4h] 45.301694
## 9 male ($95k,$Inf) [0h,0.25h] 33.386522
## 10 male ($55k,$95k] [0h,0.25h] 33.304737
## 11 male [$0,$25k] (0.25h,4h] 31.794117
## 12 male ($25k,$55k] (0.25h,4h] 31.167590
## 13 female ($95k,$Inf) (0.25h,4h] 25.919756
## 14 female ($55k,$95k] (0.25h,4h] 25.856261
## 15 male ($95k,$Inf) (0.25h,4h] 17.832806
## 16 male ($55k,$95k] (0.25h,4h] 17.789122
## 17 female [$0,$25k] (4h,Infh) 7.670599
## 18 female ($25k,$55k] (4h,Infh) 7.519444
## 19 male [$0,$25k] (4h,Infh) 5.277376
## 20 male ($25k,$55k] (4h,Infh) 5.173382
## 21 female ($95k,$Inf) (4h,Infh) 4.302315
## 22 female ($55k,$95k] (4h,Infh) 4.291776
## 23 male ($95k,$Inf) (4h,Infh) 2.959995
## 24 male ($55k,$95k] (4h,Infh) 2.952744
Chapter 2 - Arguments
Default Arguments:
Passing Arguments Between Functions:
Checking Arguments:
Example code includes:
n_visits <- snake_river_visits$n_visits
summary(n_visits)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 1.00 4.00 24.99 30.00 350.00
# Set the default for n to 5
cut_by_quantile <- function(x, n=5, na.rm, labels, interval_type) {
probs <- seq(0, 1, length.out = n + 1)
qtiles <- quantile(x, probs, na.rm = na.rm, names = FALSE)
right <- switch(interval_type, "(lo, hi]" = TRUE, "[lo, hi)" = FALSE)
cut(x, qtiles, labels = labels, right = right, include.lowest = TRUE)
}
# Remove the n argument from the call
cut_by_quantile(n_visits, na.rm = FALSE,
labels = c("very low", "low", "medium", "high", "very high"), interval_type = "(lo, hi]"
)
## [1] very low very low very low very low very low very low very low
## [8] very low very low very low very low very low very low very low
## [15] very low very low very low very low very low high very high
## [22] high very low medium low very low very low very low
## [29] very low very low very low very high very high very high very high
## [36] very high high very high very high very high very high very high
## [43] medium very high very high very high medium medium low
## [50] high high high very high very high high high
## [57] very high medium very high high medium high very high
## [64] very high very high very high high high very high high
## [71] very low very high high high medium high high
## [78] high medium very high very high very high high high
## [85] high very low very high medium high very high high
## [92] high very high high very low very low medium very low
## [99] medium medium very high medium medium medium high
## [106] low high very high medium very high medium very high
## [113] low very high low very high high very low very low
## [120] very low very low low very low very low very low very low
## [127] very low very low medium very low very low low low
## [134] very low very low low very low very low very low low
## [141] low medium medium medium medium medium very low
## [148] very low low very low low medium very low very low
## [155] very low very low very high high very high high medium
## [162] very high medium very low high medium high high
## [169] very high high high very high very high high very high
## [176] high high medium very high high high high
## [183] very high very high very low high very high high high
## [190] medium very high high very high high very high high
## [197] very high high very high very low high very high very high
## [204] very low very low medium very high medium low medium
## [211] high medium very low medium very high high very high
## [218] high very high high low high medium very high
## [225] medium high high high very low high high
## [232] high very high high medium medium very low very low
## [239] very low very low medium low very low very low very low
## [246] medium high very low very low medium very low very low
## [253] very low very low very low very low very low very low very low
## [260] very low very high medium very low very high medium very high
## [267] medium low very high medium medium medium low
## [274] high medium high very high medium very high very high
## [281] medium medium very high high medium very high high
## [288] medium low very low medium very low very low very low
## [295] very low very low low very low very low very low very low
## [302] very low very low very low very low low very low very low
## [309] very low very low low very low very low low very low
## [316] very low very low very low low very low very low very low
## [323] very low very low low very low very low very low very low
## [330] very low very low very low very low very low very low very low
## [337] very low very low very low very low very low very low very low
## [344] very low very low medium very low very low very low very low
## [351] very low very low very low very low very low very low very low
## [358] very low low very low very low very low very low very low
## [365] very low very low very low very low very low very low low
## [372] very low very low very low very high high very high very high
## [379] very high high very high very high very high very high medium
## [386] medium medium high very high high high high
## [393] high high high high very high high very high
## [400] medium high low high very high low very low
## [407] medium very low medium low
## Levels: very low low medium high very high
# Set the default for na.rm to FALSE
cut_by_quantile <- function(x, n = 5, na.rm=FALSE, labels, interval_type) {
probs <- seq(0, 1, length.out = n + 1)
qtiles <- quantile(x, probs, na.rm = na.rm, names = FALSE)
right <- switch(interval_type, "(lo, hi]" = TRUE, "[lo, hi)" = FALSE)
cut(x, qtiles, labels = labels, right = right, include.lowest = TRUE)
}
# Remove the na.rm argument from the call
cut_by_quantile(n_visits, labels = c("very low", "low", "medium", "high", "very high"),
interval_type = "(lo, hi]"
)
## [1] very low very low very low very low very low very low very low
## [8] very low very low very low very low very low very low very low
## [15] very low very low very low very low very low high very high
## [22] high very low medium low very low very low very low
## [29] very low very low very low very high very high very high very high
## [36] very high high very high very high very high very high very high
## [43] medium very high very high very high medium medium low
## [50] high high high very high very high high high
## [57] very high medium very high high medium high very high
## [64] very high very high very high high high very high high
## [71] very low very high high high medium high high
## [78] high medium very high very high very high high high
## [85] high very low very high medium high very high high
## [92] high very high high very low very low medium very low
## [99] medium medium very high medium medium medium high
## [106] low high very high medium very high medium very high
## [113] low very high low very high high very low very low
## [120] very low very low low very low very low very low very low
## [127] very low very low medium very low very low low low
## [134] very low very low low very low very low very low low
## [141] low medium medium medium medium medium very low
## [148] very low low very low low medium very low very low
## [155] very low very low very high high very high high medium
## [162] very high medium very low high medium high high
## [169] very high high high very high very high high very high
## [176] high high medium very high high high high
## [183] very high very high very low high very high high high
## [190] medium very high high very high high very high high
## [197] very high high very high very low high very high very high
## [204] very low very low medium very high medium low medium
## [211] high medium very low medium very high high very high
## [218] high very high high low high medium very high
## [225] medium high high high very low high high
## [232] high very high high medium medium very low very low
## [239] very low very low medium low very low very low very low
## [246] medium high very low very low medium very low very low
## [253] very low very low very low very low very low very low very low
## [260] very low very high medium very low very high medium very high
## [267] medium low very high medium medium medium low
## [274] high medium high very high medium very high very high
## [281] medium medium very high high medium very high high
## [288] medium low very low medium very low very low very low
## [295] very low very low low very low very low very low very low
## [302] very low very low very low very low low very low very low
## [309] very low very low low very low very low low very low
## [316] very low very low very low low very low very low very low
## [323] very low very low low very low very low very low very low
## [330] very low very low very low very low very low very low very low
## [337] very low very low very low very low very low very low very low
## [344] very low very low medium very low very low very low very low
## [351] very low very low very low very low very low very low very low
## [358] very low low very low very low very low very low very low
## [365] very low very low very low very low very low very low low
## [372] very low very low very low very high high very high very high
## [379] very high high very high very high very high very high medium
## [386] medium medium high very high high high high
## [393] high high high high very high high very high
## [400] medium high low high very high low very low
## [407] medium very low medium low
## Levels: very low low medium high very high
# Set the default for labels to NULL
cut_by_quantile <- function(x, n = 5, na.rm = FALSE, labels=NULL, interval_type) {
probs <- seq(0, 1, length.out = n + 1)
qtiles <- quantile(x, probs, na.rm = na.rm, names = FALSE)
right <- switch(interval_type, "(lo, hi]" = TRUE, "[lo, hi)" = FALSE)
cut(x, qtiles, labels = labels, right = right, include.lowest = TRUE)
}
# Remove the labels argument from the call
cut_by_quantile(n_visits, interval_type = "(lo, hi]")
## [1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [9] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [17] [0,1] [0,1] [0,1] (10,35] (35,350] (10,35] [0,1] (2,10]
## [25] (1,2] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] (35,350]
## [33] (35,350] (35,350] (35,350] (35,350] (10,35] (35,350] (35,350] (35,350]
## [41] (35,350] (35,350] (2,10] (35,350] (35,350] (35,350] (2,10] (2,10]
## [49] (1,2] (10,35] (10,35] (10,35] (35,350] (35,350] (10,35] (10,35]
## [57] (35,350] (2,10] (35,350] (10,35] (2,10] (10,35] (35,350] (35,350]
## [65] (35,350] (35,350] (10,35] (10,35] (35,350] (10,35] [0,1] (35,350]
## [73] (10,35] (10,35] (2,10] (10,35] (10,35] (10,35] (2,10] (35,350]
## [81] (35,350] (35,350] (10,35] (10,35] (10,35] [0,1] (35,350] (2,10]
## [89] (10,35] (35,350] (10,35] (10,35] (35,350] (10,35] [0,1] [0,1]
## [97] (2,10] [0,1] (2,10] (2,10] (35,350] (2,10] (2,10] (2,10]
## [105] (10,35] (1,2] (10,35] (35,350] (2,10] (35,350] (2,10] (35,350]
## [113] (1,2] (35,350] (1,2] (35,350] (10,35] [0,1] [0,1] [0,1]
## [121] [0,1] (1,2] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [129] (2,10] [0,1] [0,1] (1,2] (1,2] [0,1] [0,1] (1,2]
## [137] [0,1] [0,1] [0,1] (1,2] (1,2] (2,10] (2,10] (2,10]
## [145] (2,10] (2,10] [0,1] [0,1] (1,2] [0,1] (1,2] (2,10]
## [153] [0,1] [0,1] [0,1] [0,1] (35,350] (10,35] (35,350] (10,35]
## [161] (2,10] (35,350] (2,10] [0,1] (10,35] (2,10] (10,35] (10,35]
## [169] (35,350] (10,35] (10,35] (35,350] (35,350] (10,35] (35,350] (10,35]
## [177] (10,35] (2,10] (35,350] (10,35] (10,35] (10,35] (35,350] (35,350]
## [185] [0,1] (10,35] (35,350] (10,35] (10,35] (2,10] (35,350] (10,35]
## [193] (35,350] (10,35] (35,350] (10,35] (35,350] (10,35] (35,350] [0,1]
## [201] (10,35] (35,350] (35,350] [0,1] [0,1] (2,10] (35,350] (2,10]
## [209] (1,2] (2,10] (10,35] (2,10] [0,1] (2,10] (35,350] (10,35]
## [217] (35,350] (10,35] (35,350] (10,35] (1,2] (10,35] (2,10] (35,350]
## [225] (2,10] (10,35] (10,35] (10,35] [0,1] (10,35] (10,35] (10,35]
## [233] (35,350] (10,35] (2,10] (2,10] [0,1] [0,1] [0,1] [0,1]
## [241] (2,10] (1,2] [0,1] [0,1] [0,1] (2,10] (10,35] [0,1]
## [249] [0,1] (2,10] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [257] [0,1] [0,1] [0,1] [0,1] (35,350] (2,10] [0,1] (35,350]
## [265] (2,10] (35,350] (2,10] (1,2] (35,350] (2,10] (2,10] (2,10]
## [273] (1,2] (10,35] (2,10] (10,35] (35,350] (2,10] (35,350] (35,350]
## [281] (2,10] (2,10] (35,350] (10,35] (2,10] (35,350] (10,35] (2,10]
## [289] (1,2] [0,1] (2,10] [0,1] [0,1] [0,1] [0,1] [0,1]
## [297] (1,2] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [305] [0,1] (1,2] [0,1] [0,1] [0,1] [0,1] (1,2] [0,1]
## [313] [0,1] (1,2] [0,1] [0,1] [0,1] [0,1] (1,2] [0,1]
## [321] [0,1] [0,1] [0,1] [0,1] (1,2] [0,1] [0,1] [0,1]
## [329] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [337] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [345] [0,1] (2,10] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [353] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] (1,2] [0,1]
## [361] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [369] [0,1] [0,1] (1,2] [0,1] [0,1] [0,1] (35,350] (10,35]
## [377] (35,350] (35,350] (35,350] (10,35] (35,350] (35,350] (35,350] (35,350]
## [385] (2,10] (2,10] (2,10] (10,35] (35,350] (10,35] (10,35] (10,35]
## [393] (10,35] (10,35] (10,35] (10,35] (35,350] (10,35] (35,350] (2,10]
## [401] (10,35] (1,2] (10,35] (35,350] (1,2] [0,1] (2,10] [0,1]
## [409] (2,10] (1,2]
## Levels: [0,1] (1,2] (2,10] (10,35] (35,350]
# Set the categories for interval_type to "(lo, hi]" and "[lo, hi)"
cut_by_quantile <- function(x, n = 5, na.rm = FALSE, labels = NULL,
interval_type=c("(lo, hi]", "[lo, hi)")
) {
# Match the interval_type argument
interval_type <- match.arg(interval_type, c("(lo, hi]", "[lo, hi)"))
probs <- seq(0, 1, length.out = n + 1)
qtiles <- quantile(x, probs, na.rm = na.rm, names = FALSE)
right <- switch(interval_type, "(lo, hi]" = TRUE, "[lo, hi)" = FALSE)
cut(x, qtiles, labels = labels, right = right, include.lowest = TRUE)
}
# Remove the interval_type argument from the call
cut_by_quantile(n_visits)
## [1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [9] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [17] [0,1] [0,1] [0,1] (10,35] (35,350] (10,35] [0,1] (2,10]
## [25] (1,2] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] (35,350]
## [33] (35,350] (35,350] (35,350] (35,350] (10,35] (35,350] (35,350] (35,350]
## [41] (35,350] (35,350] (2,10] (35,350] (35,350] (35,350] (2,10] (2,10]
## [49] (1,2] (10,35] (10,35] (10,35] (35,350] (35,350] (10,35] (10,35]
## [57] (35,350] (2,10] (35,350] (10,35] (2,10] (10,35] (35,350] (35,350]
## [65] (35,350] (35,350] (10,35] (10,35] (35,350] (10,35] [0,1] (35,350]
## [73] (10,35] (10,35] (2,10] (10,35] (10,35] (10,35] (2,10] (35,350]
## [81] (35,350] (35,350] (10,35] (10,35] (10,35] [0,1] (35,350] (2,10]
## [89] (10,35] (35,350] (10,35] (10,35] (35,350] (10,35] [0,1] [0,1]
## [97] (2,10] [0,1] (2,10] (2,10] (35,350] (2,10] (2,10] (2,10]
## [105] (10,35] (1,2] (10,35] (35,350] (2,10] (35,350] (2,10] (35,350]
## [113] (1,2] (35,350] (1,2] (35,350] (10,35] [0,1] [0,1] [0,1]
## [121] [0,1] (1,2] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [129] (2,10] [0,1] [0,1] (1,2] (1,2] [0,1] [0,1] (1,2]
## [137] [0,1] [0,1] [0,1] (1,2] (1,2] (2,10] (2,10] (2,10]
## [145] (2,10] (2,10] [0,1] [0,1] (1,2] [0,1] (1,2] (2,10]
## [153] [0,1] [0,1] [0,1] [0,1] (35,350] (10,35] (35,350] (10,35]
## [161] (2,10] (35,350] (2,10] [0,1] (10,35] (2,10] (10,35] (10,35]
## [169] (35,350] (10,35] (10,35] (35,350] (35,350] (10,35] (35,350] (10,35]
## [177] (10,35] (2,10] (35,350] (10,35] (10,35] (10,35] (35,350] (35,350]
## [185] [0,1] (10,35] (35,350] (10,35] (10,35] (2,10] (35,350] (10,35]
## [193] (35,350] (10,35] (35,350] (10,35] (35,350] (10,35] (35,350] [0,1]
## [201] (10,35] (35,350] (35,350] [0,1] [0,1] (2,10] (35,350] (2,10]
## [209] (1,2] (2,10] (10,35] (2,10] [0,1] (2,10] (35,350] (10,35]
## [217] (35,350] (10,35] (35,350] (10,35] (1,2] (10,35] (2,10] (35,350]
## [225] (2,10] (10,35] (10,35] (10,35] [0,1] (10,35] (10,35] (10,35]
## [233] (35,350] (10,35] (2,10] (2,10] [0,1] [0,1] [0,1] [0,1]
## [241] (2,10] (1,2] [0,1] [0,1] [0,1] (2,10] (10,35] [0,1]
## [249] [0,1] (2,10] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [257] [0,1] [0,1] [0,1] [0,1] (35,350] (2,10] [0,1] (35,350]
## [265] (2,10] (35,350] (2,10] (1,2] (35,350] (2,10] (2,10] (2,10]
## [273] (1,2] (10,35] (2,10] (10,35] (35,350] (2,10] (35,350] (35,350]
## [281] (2,10] (2,10] (35,350] (10,35] (2,10] (35,350] (10,35] (2,10]
## [289] (1,2] [0,1] (2,10] [0,1] [0,1] [0,1] [0,1] [0,1]
## [297] (1,2] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [305] [0,1] (1,2] [0,1] [0,1] [0,1] [0,1] (1,2] [0,1]
## [313] [0,1] (1,2] [0,1] [0,1] [0,1] [0,1] (1,2] [0,1]
## [321] [0,1] [0,1] [0,1] [0,1] (1,2] [0,1] [0,1] [0,1]
## [329] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [337] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [345] [0,1] (2,10] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [353] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] (1,2] [0,1]
## [361] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [369] [0,1] [0,1] (1,2] [0,1] [0,1] [0,1] (35,350] (10,35]
## [377] (35,350] (35,350] (35,350] (10,35] (35,350] (35,350] (35,350] (35,350]
## [385] (2,10] (2,10] (2,10] (10,35] (35,350] (10,35] (10,35] (10,35]
## [393] (10,35] (10,35] (10,35] (10,35] (35,350] (10,35] (35,350] (2,10]
## [401] (10,35] (1,2] (10,35] (35,350] (1,2] [0,1] (2,10] [0,1]
## [409] (2,10] (1,2]
## Levels: [0,1] (1,2] (2,10] (10,35] (35,350]
std_and_poor500 <- readRDS("./RInputFiles/std_and_poor500_with_pe_2019-06-21.rds")
glimpse(std_and_poor500)
## Observations: 505
## Variables: 5
## $ symbol <chr> "MMM", "ABT", "ABBV", "ABMD", "ACN", "ATVI", "ADBE", "AMD"...
## $ company <chr> "3M Company", "Abbott Laboratories", "AbbVie Inc.", "ABIOM...
## $ sector <chr> "Industrials", "Health Care", "Health Care", "Health Care"...
## $ industry <chr> "Industrial Conglomerates", "Health Care Equipment", "Phar...
## $ pe_ratio <dbl> 18.31678, 57.66621, 22.43805, 45.63993, 27.00233, 20.13596...
# From previous steps
get_reciprocal <- function(x) {
1 / x
}
calc_harmonic_mean <- function(x) {
x %>%
get_reciprocal() %>%
mean(na.rm=TRUE) %>%
get_reciprocal()
}
std_and_poor500 %>%
# Group by sector
group_by(sector) %>%
# Summarize, calculating harmonic mean of P/E ratio
summarize(hmean_pe_ratio = calc_harmonic_mean(pe_ratio))
## # A tibble: 11 x 2
## sector hmean_pe_ratio
## <chr> <dbl>
## 1 Communication Services 17.5
## 2 Consumer Discretionary 15.2
## 3 Consumer Staples 19.8
## 4 Energy 13.7
## 5 Financials 12.9
## 6 Health Care 26.6
## 7 Industrials 18.2
## 8 Information Technology 21.6
## 9 Materials 16.3
## 10 Real Estate 32.5
## 11 Utilities 23.9
# From previous step
calc_harmonic_mean <- function(x, na.rm = FALSE) {
x %>%
get_reciprocal() %>%
mean(na.rm = na.rm) %>%
get_reciprocal()
}
std_and_poor500 %>%
# Group by sector
group_by(sector) %>%
# Summarize, calculating harmonic mean of P/E ratio
summarize(hmean_pe_ratio = calc_harmonic_mean(pe_ratio, na.rm=TRUE))
## # A tibble: 11 x 2
## sector hmean_pe_ratio
## <chr> <dbl>
## 1 Communication Services 17.5
## 2 Consumer Discretionary 15.2
## 3 Consumer Staples 19.8
## 4 Energy 13.7
## 5 Financials 12.9
## 6 Health Care 26.6
## 7 Industrials 18.2
## 8 Information Technology 21.6
## 9 Materials 16.3
## 10 Real Estate 32.5
## 11 Utilities 23.9
calc_harmonic_mean <- function(x, ...) {
x %>%
get_reciprocal() %>%
mean(...) %>%
get_reciprocal()
}
std_and_poor500 %>%
# Group by sector
group_by(sector) %>%
# Summarize, calculating harmonic mean of P/E ratio
summarize(hmean_pe_ratio=calc_harmonic_mean(pe_ratio, na.rm=TRUE))
## # A tibble: 11 x 2
## sector hmean_pe_ratio
## <chr> <dbl>
## 1 Communication Services 17.5
## 2 Consumer Discretionary 15.2
## 3 Consumer Staples 19.8
## 4 Energy 13.7
## 5 Financials 12.9
## 6 Health Care 26.6
## 7 Industrials 18.2
## 8 Information Technology 21.6
## 9 Materials 16.3
## 10 Real Estate 32.5
## 11 Utilities 23.9
calc_harmonic_mean <- function(x, na.rm = FALSE) {
# Assert that x is numeric
assertive.types::assert_is_numeric(x)
x %>%
get_reciprocal() %>%
mean(na.rm = na.rm) %>%
get_reciprocal()
}
# See what happens when you pass it strings (bombs out, as it should)
# calc_harmonic_mean(std_and_poor500$sector)
calc_harmonic_mean <- function(x, na.rm = FALSE) {
assertive.types::assert_is_numeric(x)
# Check if any values of x are non-positive
if(any(assertive.numbers::is_non_positive(x), na.rm = TRUE)) {
# Throw an error
stop("x contains non-positive values, so the harmonic mean makes no sense.")
}
x %>%
get_reciprocal() %>%
mean(na.rm = na.rm) %>%
get_reciprocal()
}
# See what happens when you pass it negative numbers (bombs out as it should)
# calc_harmonic_mean(std_and_poor500$pe_ratio - 20)
# Update the function definition to fix the na.rm argument
calc_harmonic_mean <- function(x, na.rm = FALSE) {
assertive.types::assert_is_numeric(x)
if(any(assertive.numbers::is_non_positive(x), na.rm = TRUE)) {
stop("x contains non-positive values, so the harmonic mean makes no sense.")
}
# Use the first value of na.rm, and coerce to logical
na.rm <- assertive.base::coerce_to(assertive.base::use_first(na.rm), "logical")
x %>%
get_reciprocal() %>%
mean(na.rm = na.rm) %>%
get_reciprocal()
}
# See what happens when you pass it malformed na.rm
calc_harmonic_mean(std_and_poor500$pe_ratio, na.rm = 1:5)
## Warning: Only the first value of na.rm (= 1) will be used.
## Warning: Coercing assertive.base::use_first(na.rm) to class 'logical'.
## [1] 18.23871
Chapter 3 - Return Values and Scope
Returning Values from Functions:
Returning Multiple Values from Functions:
Environments:
Scope and Precedence:
Example code includes:
is_leap_year <- function(year) {
# If year is div. by 400 return TRUE
if(year %% 400 == 0) {
return(TRUE)
}
# If year is div. by 100 return FALSE
if(year %% 100 == 0) {
return(FALSE)
}
# If year is div. by 4 return TRUE
if(year %% 4 == 0) {
return(TRUE)
}
# Otherwise return FALSE
return(FALSE)
}
cars <- data.frame(speed=c(4, 4, 7, 7, 8, 9, 10, 10, 10, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 16, 16, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 20, 20, 20, 20, 20, 22, 23, 24, 24, 24, 24, 25),
dist=c(2, 10, 4, 22, 16, 10, 18, 26, 34, 17, 28, 14, 20, 24, 28, 26, 34, 34, 46, 26, 36, 60, 80, 20, 26, 54, 32, 40, 32, 40, 50, 42, 56, 76, 84, 36, 46, 68, 32, 48, 52, 56, 64, 66, 54, 70, 92, 93, 120, 85)
)
str(cars)
## 'data.frame': 50 obs. of 2 variables:
## $ speed: num 4 4 7 7 8 9 10 10 10 11 ...
## $ dist : num 2 10 4 22 16 10 18 26 34 17 ...
# Using cars, draw a scatter plot of dist vs. speed
plt_dist_vs_speed <- plot(dist ~ speed, data = cars)
# Oh no! The plot object is NULL
plt_dist_vs_speed
## NULL
# Define a scatter plot fn with data and formula args
pipeable_plot <- function(data, formula) {
# Call plot() with the formula interface
plot(formula, data)
# Invisibly return the input dataset
invisible(data)
}
# Draw the scatter plot of dist vs. speed again
plt_dist_vs_speed <- cars %>%
pipeable_plot(dist ~ speed)
# Now the plot object has a value
plt_dist_vs_speed
## speed dist
## 1 4 2
## 2 4 10
## 3 7 4
## 4 7 22
## 5 8 16
## 6 9 10
## 7 10 18
## 8 10 26
## 9 10 34
## 10 11 17
## 11 11 28
## 12 12 14
## 13 12 20
## 14 12 24
## 15 12 28
## 16 13 26
## 17 13 34
## 18 13 34
## 19 13 46
## 20 14 26
## 21 14 36
## 22 14 60
## 23 14 80
## 24 15 20
## 25 15 26
## 26 15 54
## 27 16 32
## 28 16 40
## 29 17 32
## 30 17 40
## 31 17 50
## 32 18 42
## 33 18 56
## 34 18 76
## 35 18 84
## 36 19 36
## 37 19 46
## 38 19 68
## 39 20 32
## 40 20 48
## 41 20 52
## 42 20 56
## 43 20 64
## 44 22 66
## 45 23 54
## 46 24 70
## 47 24 92
## 48 24 93
## 49 24 120
## 50 25 85
# Look at the structure of model (it's a mess!)
str(model)
## List of 31
## $ coefficients : Named num [1:7] 4.0864 0.374 -0.0199 -0.5807 -0.5782 ...
## ..- attr(*, "names")= chr [1:7] "(Intercept)" "genderfemale" "income($25k,$55k]" "income($55k,$95k]" ...
## $ residuals : Named num [1:346] -0.535 -0.768 -0.944 -0.662 -0.767 ...
## ..- attr(*, "names")= chr [1:346] "25" "26" "27" "29" ...
## $ fitted.values : Named num [1:346] 4.3 4.3 17.83 2.96 4.29 ...
## ..- attr(*, "names")= chr [1:346] "25" "26" "27" "29" ...
## $ effects : Named num [1:346] -360 -29.2 20.3 -10 23.4 ...
## ..- attr(*, "names")= chr [1:346] "(Intercept)" "genderfemale" "income($25k,$55k]" "income($55k,$95k]" ...
## $ R : num [1:7, 1:7] -97.4 0 0 0 0 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:7] "(Intercept)" "genderfemale" "income($25k,$55k]" "income($55k,$95k]" ...
## .. ..$ : chr [1:7] "(Intercept)" "genderfemale" "income($25k,$55k]" "income($55k,$95k]" ...
## $ rank : int 7
## $ qr :List of 5
## ..$ qr : num [1:346, 1:7] -97.3861 0.0213 0.0434 0.0177 0.0213 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:346] "25" "26" "27" "29" ...
## .. .. ..$ : chr [1:7] "(Intercept)" "genderfemale" "income($25k,$55k]" "income($55k,$95k]" ...
## ..$ rank : int 7
## ..$ qraux: num [1:7] 1.02 1.02 1.04 1.01 1 ...
## ..$ pivot: int [1:7] 1 2 3 4 5 6 7
## ..$ tol : num 1e-11
## ..- attr(*, "class")= chr "qr"
## $ family :List of 12
## ..$ family : chr "poisson"
## ..$ link : chr "log"
## ..$ linkfun :function (mu)
## ..$ linkinv :function (eta)
## ..$ variance :function (mu)
## ..$ dev.resids:function (y, mu, wt)
## ..$ aic :function (y, n, mu, wt, dev)
## ..$ mu.eta :function (eta)
## ..$ initialize: expression({ if (any(y < 0)) stop("negative values not allowed for the 'Poisson' family") n <- rep.int(1, nobs| __truncated__
## ..$ validmu :function (mu)
## ..$ valideta :function (eta)
## ..$ simulate :function (object, nsim)
## ..- attr(*, "class")= chr "family"
## $ linear.predictors: Named num [1:346] 1.46 1.46 2.88 1.09 1.46 ...
## ..- attr(*, "names")= chr [1:346] "25" "26" "27" "29" ...
## $ deviance : num 11529
## $ aic : num 12864
## $ null.deviance : num 18850
## $ iter : int 6
## $ weights : Named num [1:346] 4.3 4.3 17.83 2.96 4.29 ...
## ..- attr(*, "names")= chr [1:346] "25" "26" "27" "29" ...
## $ prior.weights : Named num [1:346] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "names")= chr [1:346] "25" "26" "27" "29" ...
## $ df.residual : int 339
## $ df.null : int 345
## $ y : Named num [1:346] 2 1 1 1 1 1 80 104 55 350 ...
## ..- attr(*, "names")= chr [1:346] "25" "26" "27" "29" ...
## $ converged : logi TRUE
## $ boundary : logi FALSE
## $ model :'data.frame': 346 obs. of 4 variables:
## ..$ n_visits: num [1:346] 2 1 1 1 1 1 80 104 55 350 ...
## ..$ gender : Factor w/ 2 levels "male","female": 2 2 1 1 2 1 2 2 1 2 ...
## ..$ income : Factor w/ 4 levels "[$0,$25k]","($25k,$55k]",..: 4 4 4 4 3 1 1 4 2 2 ...
## ..$ travel : Factor w/ 3 levels "[0h,0.25h]","(0.25h,4h]",..: 3 3 2 3 3 1 1 1 2 1 ...
## ..- attr(*, "terms")=Classes 'terms', 'formula' language n_visits ~ gender + income + travel
## .. .. ..- attr(*, "variables")= language list(n_visits, gender, income, travel)
## .. .. ..- attr(*, "factors")= int [1:4, 1:3] 0 1 0 0 0 0 1 0 0 0 ...
## .. .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. .. ..$ : chr [1:4] "n_visits" "gender" "income" "travel"
## .. .. .. .. ..$ : chr [1:3] "gender" "income" "travel"
## .. .. ..- attr(*, "term.labels")= chr [1:3] "gender" "income" "travel"
## .. .. ..- attr(*, "order")= int [1:3] 1 1 1
## .. .. ..- attr(*, "intercept")= int 1
## .. .. ..- attr(*, "response")= int 1
## .. .. ..- attr(*, ".Environment")=<environment: 0x000000001d3bf1a0>
## .. .. ..- attr(*, "predvars")= language list(n_visits, gender, income, travel)
## .. .. ..- attr(*, "dataClasses")= Named chr [1:4] "numeric" "factor" "factor" "factor"
## .. .. .. ..- attr(*, "names")= chr [1:4] "n_visits" "gender" "income" "travel"
## ..- attr(*, "na.action")= 'omit' Named int [1:64] 1 2 3 4 5 6 7 8 9 10 ...
## .. ..- attr(*, "names")= chr [1:64] "1" "2" "3" "4" ...
## $ na.action : 'omit' Named int [1:64] 1 2 3 4 5 6 7 8 9 10 ...
## ..- attr(*, "names")= chr [1:64] "1" "2" "3" "4" ...
## $ call : language glm(formula = formula, family = poisson, data = data)
## $ formula :Class 'formula' language n_visits ~ gender + income + travel
## .. ..- attr(*, ".Environment")=<environment: 0x000000001d3bf1a0>
## $ terms :Classes 'terms', 'formula' language n_visits ~ gender + income + travel
## .. ..- attr(*, "variables")= language list(n_visits, gender, income, travel)
## .. ..- attr(*, "factors")= int [1:4, 1:3] 0 1 0 0 0 0 1 0 0 0 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:4] "n_visits" "gender" "income" "travel"
## .. .. .. ..$ : chr [1:3] "gender" "income" "travel"
## .. ..- attr(*, "term.labels")= chr [1:3] "gender" "income" "travel"
## .. ..- attr(*, "order")= int [1:3] 1 1 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: 0x000000001d3bf1a0>
## .. ..- attr(*, "predvars")= language list(n_visits, gender, income, travel)
## .. ..- attr(*, "dataClasses")= Named chr [1:4] "numeric" "factor" "factor" "factor"
## .. .. ..- attr(*, "names")= chr [1:4] "n_visits" "gender" "income" "travel"
## $ data :'data.frame': 410 obs. of 4 variables:
## ..$ n_visits: num [1:410] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ gender : Factor w/ 2 levels "male","female": 1 1 1 2 1 2 2 2 1 1 ...
## ..$ income : Factor w/ 4 levels "[$0,$25k]","($25k,$55k]",..: 4 2 4 2 4 2 4 4 4 4 ...
## ..$ travel : Factor w/ 3 levels "[0h,0.25h]","(0.25h,4h]",..: NA NA NA NA NA NA NA NA NA NA ...
## $ offset : NULL
## $ control :List of 3
## ..$ epsilon: num 1e-08
## ..$ maxit : num 25
## ..$ trace : logi FALSE
## $ method : chr "glm.fit"
## $ contrasts :List of 3
## ..$ gender: chr "contr.treatment"
## ..$ income: chr "contr.treatment"
## ..$ travel: chr "contr.treatment"
## $ xlevels :List of 3
## ..$ gender: chr [1:2] "male" "female"
## ..$ income: chr [1:4] "[$0,$25k]" "($25k,$55k]" "($55k,$95k]" "($95k,$Inf)"
## ..$ travel: chr [1:3] "[0h,0.25h]" "(0.25h,4h]" "(4h,Infh)"
## - attr(*, "class")= chr [1:2] "glm" "lm"
# Use broom tools to get a list of 3 data frames
list(
# Get model-level values
model = broom::glance(model),
# Get coefficient-level values
coefficients = broom::tidy(model),
# Get observation-level values
observations = broom::augment(model)
)
## $model
## # A tibble: 1 x 7
## null.deviance df.null logLik AIC BIC deviance df.residual
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int>
## 1 18850. 345 -6425. 12864. 12891. 11529. 339
##
## $coefficients
## # A tibble: 7 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 4.09 0.0279 146. 0.
## 2 genderfemale 0.374 0.0212 17.6 2.18e- 69
## 3 income($25k,$55k] -0.0199 0.0267 -0.746 4.56e- 1
## 4 income($55k,$95k] -0.581 0.0343 -16.9 3.28e- 64
## 5 income($95k,$Inf) -0.578 0.0310 -18.7 6.88e- 78
## 6 travel(0.25h,4h] -0.627 0.0217 -28.8 5.40e-183
## 7 travel(4h,Infh) -2.42 0.0492 -49.3 0.
##
## $observations
## # A tibble: 346 x 12
## .rownames n_visits gender income travel .fitted .se.fit .resid .hat
## <chr> <dbl> <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 25 2 female ($95k~ (4h,I~ 1.46 0.0502 -1.24 0.0109
## 2 26 1 female ($95k~ (4h,I~ 1.46 0.0502 -1.92 0.0109
## 3 27 1 male ($95k~ (0.25~ 2.88 0.0269 -5.28 0.0129
## 4 29 1 male ($95k~ (4h,I~ 1.09 0.0490 -1.32 0.00711
## 5 30 1 female ($55k~ (4h,I~ 1.46 0.0531 -1.92 0.0121
## 6 31 1 male [$0,$~ [0h,0~ 4.09 0.0279 -10.4 0.0465
## 7 33 80 female [$0,$~ [0h,0~ 4.46 0.0235 -0.710 0.0479
## 8 34 104 female ($95k~ [0h,0~ 3.88 0.0261 6.90 0.0332
## 9 35 55 male ($25k~ (0.25~ 3.44 0.0222 3.85 0.0153
## 10 36 350 female ($25k~ [0h,0~ 4.44 0.0206 21.5 0.0360
## # ... with 336 more rows, and 3 more variables: .sigma <dbl>, .cooksd <dbl>,
## # .std.resid <dbl>
# From previous step
groom_model <- function(model) {
list(
model = broom::glance(model),
coefficients = broom::tidy(model),
observations = broom::augment(model)
)
}
library(zeallot) # needed for %<-%
# Call groom_model on model, assigning to 3 variables
c(mdl, cff, obs) %<-% groom_model(model)
# See these individual variables
mdl; cff; obs
## # A tibble: 1 x 7
## null.deviance df.null logLik AIC BIC deviance df.residual
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int>
## 1 18850. 345 -6425. 12864. 12891. 11529. 339
## # A tibble: 7 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 4.09 0.0279 146. 0.
## 2 genderfemale 0.374 0.0212 17.6 2.18e- 69
## 3 income($25k,$55k] -0.0199 0.0267 -0.746 4.56e- 1
## 4 income($55k,$95k] -0.581 0.0343 -16.9 3.28e- 64
## 5 income($95k,$Inf) -0.578 0.0310 -18.7 6.88e- 78
## 6 travel(0.25h,4h] -0.627 0.0217 -28.8 5.40e-183
## 7 travel(4h,Infh) -2.42 0.0492 -49.3 0.
## # A tibble: 346 x 12
## .rownames n_visits gender income travel .fitted .se.fit .resid .hat
## <chr> <dbl> <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 25 2 female ($95k~ (4h,I~ 1.46 0.0502 -1.24 0.0109
## 2 26 1 female ($95k~ (4h,I~ 1.46 0.0502 -1.92 0.0109
## 3 27 1 male ($95k~ (0.25~ 2.88 0.0269 -5.28 0.0129
## 4 29 1 male ($95k~ (4h,I~ 1.09 0.0490 -1.32 0.00711
## 5 30 1 female ($55k~ (4h,I~ 1.46 0.0531 -1.92 0.0121
## 6 31 1 male [$0,$~ [0h,0~ 4.09 0.0279 -10.4 0.0465
## 7 33 80 female [$0,$~ [0h,0~ 4.46 0.0235 -0.710 0.0479
## 8 34 104 female ($95k~ [0h,0~ 3.88 0.0261 6.90 0.0332
## 9 35 55 male ($25k~ (0.25~ 3.44 0.0222 3.85 0.0153
## 10 36 350 female ($25k~ [0h,0~ 4.44 0.0206 21.5 0.0360
## # ... with 336 more rows, and 3 more variables: .sigma <dbl>, .cooksd <dbl>,
## # .std.resid <dbl>
pipeable_plot <- function(data, formula) {
plot(formula, data)
# Add a "formula" attribute to data
attr(data, "formula") <- formula
invisible(data)
}
# From previous exercise
plt_dist_vs_speed <- cars %>%
pipeable_plot(dist ~ speed)
# Examine the structure of the result
str(plt_dist_vs_speed)
## 'data.frame': 50 obs. of 2 variables:
## $ speed: num 4 4 7 7 8 9 10 10 10 11 ...
## $ dist : num 2 10 4 22 16 10 18 26 34 17 ...
## - attr(*, "formula")=Class 'formula' language dist ~ speed
## .. ..- attr(*, ".Environment")=<environment: 0x000000001c3d44f0>
capitals <- tibble::tibble(city=c("Cape Town", "Bloemfontein", "Pretoria"),
type_of_capital=c("Legislative", "Judicial", "Administrative")
)
national_parks <- c('Addo Elephant National Park', 'Agulhas National Park', 'Ai-Ais/Richtersveld Transfrontier Park', 'Augrabies Falls National Park', 'Bontebok National Park', 'Camdeboo National Park', 'Golden Gate Highlands National Park', 'Hluhluwe–Imfolozi Park', 'Karoo National Park', 'Kgalagadi Transfrontier Park', 'Knysna National Lake Area', 'Kruger National Park', 'Mapungubwe National Park', 'Marakele National Park', 'Mokala National Park', 'Mountain Zebra National Park', 'Namaqua National Park', 'Table Mountain National Park', 'Tankwa Karoo National Park', 'Tsitsikamma National Park', 'West Coast National Park', 'Wilderness National Park')
population <- ts(c(40583573, 44819778, 47390900, 51770560, 55908900), start=1996, end=2016, deltat=5)
capitals
## # A tibble: 3 x 2
## city type_of_capital
## <chr> <chr>
## 1 Cape Town Legislative
## 2 Bloemfontein Judicial
## 3 Pretoria Administrative
national_parks
## [1] "Addo Elephant National Park"
## [2] "Agulhas National Park"
## [3] "Ai-Ais/Richtersveld Transfrontier Park"
## [4] "Augrabies Falls National Park"
## [5] "Bontebok National Park"
## [6] "Camdeboo National Park"
## [7] "Golden Gate Highlands National Park"
## [8] "Hluhluwe–Imfolozi Park"
## [9] "Karoo National Park"
## [10] "Kgalagadi Transfrontier Park"
## [11] "Knysna National Lake Area"
## [12] "Kruger National Park"
## [13] "Mapungubwe National Park"
## [14] "Marakele National Park"
## [15] "Mokala National Park"
## [16] "Mountain Zebra National Park"
## [17] "Namaqua National Park"
## [18] "Table Mountain National Park"
## [19] "Tankwa Karoo National Park"
## [20] "Tsitsikamma National Park"
## [21] "West Coast National Park"
## [22] "Wilderness National Park"
population
## Time Series:
## Start = 1996
## End = 2016
## Frequency = 0.2
## [1] 40583573 44819778 47390900 51770560 55908900
# From previous steps
rsa_lst <- list(
capitals = capitals,
national_parks = national_parks,
population = population
)
rsa_env <- list2env(rsa_lst)
ls.str(rsa_lst)
## capitals : Classes 'tbl_df', 'tbl' and 'data.frame': 3 obs. of 2 variables:
## $ city : chr "Cape Town" "Bloemfontein" "Pretoria"
## $ type_of_capital: chr "Legislative" "Judicial" "Administrative"
## national_parks : chr [1:22] "Addo Elephant National Park" "Agulhas National Park" ...
## population : Time-Series [1:5] from 1996 to 2016: 40583573 44819778 47390900 51770560 55908900
ls.str(rsa_env)
## capitals : Classes 'tbl_df', 'tbl' and 'data.frame': 3 obs. of 2 variables:
## $ city : chr "Cape Town" "Bloemfontein" "Pretoria"
## $ type_of_capital: chr "Legislative" "Judicial" "Administrative"
## national_parks : chr [1:22] "Addo Elephant National Park" "Agulhas National Park" ...
## population : Time-Series [1:5] from 1996 to 2016: 40583573 44819778 47390900 51770560 55908900
# Find the parent environment of rsa_env
parent <- parent.env(rsa_env)
# Print its name
environmentName(parent)
## [1] "R_GlobalEnv"
# Compare the contents of the global environment and rsa_env
# ls.str(globalenv())
ls.str(rsa_env)
## capitals : Classes 'tbl_df', 'tbl' and 'data.frame': 3 obs. of 2 variables:
## $ city : chr "Cape Town" "Bloemfontein" "Pretoria"
## $ type_of_capital: chr "Legislative" "Judicial" "Administrative"
## national_parks : chr [1:22] "Addo Elephant National Park" "Agulhas National Park" ...
## population : Time-Series [1:5] from 1996 to 2016: 40583573 44819778 47390900 51770560 55908900
# Does population exist in rsa_env?
exists("population", envir = rsa_env)
## [1] TRUE
# Does population exist in rsa_env, ignoring inheritance?
exists("population", envir = rsa_env, inherits=FALSE)
## [1] TRUE
Chapter 4 - Case Study on Grain Yields
Grain Yields and Conversion:
Visualizing Grain Yields:
Modeling Grain Yields:
Wrap Up:
Example code includes:
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
corn <- readRDS("./RInputFiles/nass.corn.rds")
wheat <- readRDS("./RInputFiles/nass.wheat.rds")
barley <- readRDS("./RInputFiles/nass.barley.rds")
corn <- as_tibble(corn)
wheat <- as_tibble(wheat)
barley <- as_tibble(barley)
str(corn)
## Classes 'tbl_df', 'tbl' and 'data.frame': 6381 obs. of 4 variables:
## $ year : int 1866 1866 1866 1866 1866 1866 1866 1866 1866 1866 ...
## $ state : chr "Alabama" "Arkansas" "California" "Connecticut" ...
## $ farmed_area_acres : num 1050000 280000 42000 57000 200000 ...
## $ yield_bushels_per_acre: num 9 18 28 34 23 9 6 29 36.5 32 ...
str(wheat)
## Classes 'tbl_df', 'tbl' and 'data.frame': 5963 obs. of 4 variables:
## $ year : int 1866 1866 1866 1866 1866 1866 1866 1866 1866 1866 ...
## $ state : chr "Alabama" "Arkansas" "California" "Connecticut" ...
## $ farmed_area_acres : num 125000 50000 650000 2000 59000 245000 2300000 1550000 1190000 68000 ...
## $ yield_bushels_per_acre: num 5 6.5 18 17.5 11 4 10.5 10 13 19 ...
str(barley)
## Classes 'tbl_df', 'tbl' and 'data.frame': 4839 obs. of 4 variables:
## $ year : int 1866 1866 1866 1866 1866 1866 1866 1866 1866 1866 ...
## $ state : chr "Connecticut" "Illinois" "Indiana" "Iowa" ...
## $ farmed_area_acres : num 1000 96000 11000 66000 2000 10000 34000 7000 21000 20000 ...
## $ yield_bushels_per_acre: num 22.5 23.4 23 22 23 23.5 21.5 25.5 26 26 ...
# Write a function to convert acres to sq. yards
acres_to_sq_yards <- function(acres) {
acres * 4840
}
# Write a function to convert yards to meters
yards_to_meters <- function(yards) {
yards * 36 * 0.0254
}
# Write a function to convert sq. meters to hectares
sq_meters_to_hectares <- function(sq_meters) {
sq_meters / 10000
}
# Write a function to convert sq. yards to sq. meters
sq_yards_to_sq_meters <- function(sq_yards) {
sq_yards %>%
# Take the square root
sqrt() %>%
# Convert yards to meters
yards_to_meters() %>%
# Square it
raise_to_power(2)
}
# Write a function to convert acres to hectares
acres_to_hectares <- function(acres) {
acres %>%
# Convert acres to sq yards
acres_to_sq_yards() %>%
# Convert sq yards to sq meters
sqrt() %>%
yards_to_meters() %>%
raise_to_power(2) %>%
# Convert sq meters to hectares
sq_meters_to_hectares()
}
# Write a function to convert lb to kg
lbs_to_kgs <- function(lbs) {
lbs * 0.45359237
}
# Write a function to convert bushels to lbs
bushels_to_lbs <- function(bushels, crop) {
# Define a lookup table of scale factors
c(barley = 48, corn = 56, wheat = 60, volume = 8) %>%
# Extract the value for the crop
extract(crop) %>%
# Multiply by the no. of bushels
multiply_by(bushels)
}
# Write a function to convert bushels to kg
bushels_to_kgs <- function(bushels, crop) {
bushels %>%
# Convert bushels to lbs
bushels_to_lbs(crop) %>%
# Convert lbs to kgs
lbs_to_kgs()
}
# Write a function to convert bushels/acre to kg/ha
bushels_per_acre_to_kgs_per_hectare <- function(bushels_per_acre, crop = c("barley", "corn", "wheat")) {
# Match the crop argument
crop <- match.arg(crop)
bushels_per_acre %>%
# Convert bushels to kgs
bushels_to_kgs(crop) %>%
# Convert acres to ha
acres_to_hectares()
}
# View the corn dataset
glimpse(corn)
## Observations: 6,381
## Variables: 4
## $ year <int> 1866, 1866, 1866, 1866, 1866, 1866, 1866, 18...
## $ state <chr> "Alabama", "Arkansas", "California", "Connec...
## $ farmed_area_acres <dbl> 1050000, 280000, 42000, 57000, 200000, 12500...
## $ yield_bushels_per_acre <dbl> 9.0, 18.0, 28.0, 34.0, 23.0, 9.0, 6.0, 29.0,...
corn <- corn %>%
# Add some columns
mutate(
# Convert farmed area from acres to ha
farmed_area_ha = acres_to_hectares(farmed_area_acres),
# Convert yield from bushels/acre to kg/ha
yield_kg_per_ha = bushels_per_acre_to_kgs_per_hectare(yield_bushels_per_acre, crop = "corn")
)
# Wrap this code into a function
fortify_with_metric_units <- function(data, crop) {
data %>%
mutate(
farmed_area_ha = acres_to_hectares(farmed_area_acres),
yield_kg_per_ha = bushels_per_acre_to_kgs_per_hectare(yield_bushels_per_acre, crop = crop)
)
}
# Try it on the wheat dataset
wheat <- fortify_with_metric_units(wheat, "wheat")
# Using corn, plot yield (kg/ha) vs. year
ggplot(corn, aes(x=year, y=yield_kg_per_ha)) +
# Add a line layer, grouped by state
geom_line(aes(group = state)) +
# Add a smooth trend layer
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Wrap this plotting code into a function
plot_yield_vs_year <- function(data) {
ggplot(data, aes(year, yield_kg_per_ha)) +
geom_line(aes(group = state)) +
geom_smooth()
}
# Test it on the wheat dataset
plot_yield_vs_year(wheat)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
usa_census_regions <- tibble::tibble(census_region=c('New England', 'New England', 'New England', 'New England', 'New England', 'New England', 'Mid-Atlantic', 'Mid-Atlantic', 'Mid-Atlantic', 'East North Central', 'East North Central', 'East North Central', 'East North Central', 'East North Central', 'West North Central', 'West North Central', 'West North Central', 'West North Central', 'West North Central', 'West North Central', 'West North Central', 'South Atlantic', 'South Atlantic', 'South Atlantic', 'South Atlantic', 'South Atlantic', 'South Atlantic', 'South Atlantic', 'South Atlantic', 'South Atlantic', 'East South Central', 'East South Central', 'East South Central', 'East South Central', 'West South Central', 'West South Central', 'West South Central', 'West South Central', 'Mountain', 'Mountain', 'Mountain', 'Mountain', 'Mountain', 'Mountain', 'Mountain', 'Mountain', 'Pacific', 'Pacific', 'Pacific', 'Pacific', 'Pacific'),
state=c('Connecticut', 'Maine', 'Massachusetts', 'New Hampshire', 'Rhode Island', 'Vermont', 'New Jersey', 'New York', 'Pennsylvania', 'Illinois', 'Indiana', 'Michigan', 'Ohio', 'Wisconsin', 'Iowa', 'Kansas', 'Minnesota', 'Missouri', 'Nebraska', 'North Dakota', 'South Dakota', 'Delaware', 'Florida', 'Georgia', 'Maryland', 'North Carolina', 'South Carolina', 'Virginia', 'District of Columbia', 'West Virginia', 'Alabama', 'Kentucky', 'Mississippi', 'Tennessee', 'Arkansas', 'Louisiana', 'Oklahoma', 'Texas', 'Arizona', 'Colorado', 'Idaho', 'Montana', 'Nevada', 'New Mexico', 'Utah', 'Wyoming', 'Alaska', 'California', 'Hawaii', 'Oregon', 'Washington')
)
usa_census_regions
## # A tibble: 51 x 2
## census_region state
## <chr> <chr>
## 1 New England Connecticut
## 2 New England Maine
## 3 New England Massachusetts
## 4 New England New Hampshire
## 5 New England Rhode Island
## 6 New England Vermont
## 7 Mid-Atlantic New Jersey
## 8 Mid-Atlantic New York
## 9 Mid-Atlantic Pennsylvania
## 10 East North Central Illinois
## # ... with 41 more rows
# Inner join the corn dataset to usa_census_regions by state
corn <- corn %>%
inner_join(usa_census_regions, by = "state")
# Wrap this code into a function
fortify_with_census_region <- function(data) {
data %>%
inner_join(usa_census_regions, by = "state")
}
# Try it on the wheat dataset
wheat <- fortify_with_census_region(wheat)
# Plot yield vs. year for the corn dataset
plot_yield_vs_year(corn) +
facet_wrap(~census_region)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Wrap this code into a function
plot_yield_vs_year_by_region <- function(data) {
plot_yield_vs_year(data) +
facet_wrap(vars(census_region))
}
# Try it on the wheat dataset
plot_yield_vs_year_by_region(wheat)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Wrap the model code into a function
run_gam_yield_vs_year_by_region <- function(data) {
mgcv::gam(yield_kg_per_ha ~ s(year) + census_region, data = data)
}
# Try it on the wheat dataset
wheat_model <- run_gam_yield_vs_year_by_region(wheat)
corn_model <- run_gam_yield_vs_year_by_region(wheat)
# Make predictions in 2050
predict_this <- data.frame(year = 2050, census_region = unique(usa_census_regions$census_region))
# Predict the yield
pred_yield_kg_per_ha <- predict(corn_model, predict_this, type = "response")
predict_this %>%
# Add the prediction as a column of predict_this
mutate(pred_yield_kg_per_ha = pred_yield_kg_per_ha)
## year census_region pred_yield_kg_per_ha
## 1 2050 New England 901.7706
## 2 2050 Mid-Atlantic 888.5455
## 3 2050 East North Central 895.8256
## 4 2050 West North Central 816.2401
## 5 2050 South Atlantic 831.8758
## 6 2050 East South Central 816.5198
## 7 2050 West South Central 780.2498
## 8 2050 Mountain 893.8168
## 9 2050 Pacific 934.7567
# Wrap this prediction code into a function
predict_yields <- function(model, year) {
predict_this <- data.frame(year = year, census_region = unique(usa_census_regions$census_region))
pred_yield_kg_per_ha <- predict(model, predict_this, type = "response")
predict_this %>%
mutate(pred_yield_kg_per_ha = pred_yield_kg_per_ha)
}
# Try it on the wheat dataset
predict_yields(wheat_model, year=2050)
## year census_region pred_yield_kg_per_ha
## 1 2050 New England 901.7706
## 2 2050 Mid-Atlantic 888.5455
## 3 2050 East North Central 895.8256
## 4 2050 West North Central 816.2401
## 5 2050 South Atlantic 831.8758
## 6 2050 East South Central 816.5198
## 7 2050 West South Central 780.2498
## 8 2050 Mountain 893.8168
## 9 2050 Pacific 934.7567
# From previous step
fortified_barley <- barley %>%
fortify_with_metric_units(crop="barley") %>%
fortify_with_census_region()
# Plot yield vs. year by region
plot_yield_vs_year_by_region(fortified_barley)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
fortified_barley %>%
# Run a GAM of yield vs. year by region
run_gam_yield_vs_year_by_region() %>%
# Make predictions of yields in 2050
predict_yields(year=2050)
## year census_region pred_yield_kg_per_ha
## 1 2050 New England 692.7372
## 2 2050 Mid-Atlantic 695.6051
## 3 2050 East North Central 689.5561
## 4 2050 West North Central 629.5246
## 5 2050 South Atlantic 695.7666
## 6 2050 East South Central 657.6750
## 7 2050 West South Central 595.9212
## 8 2050 Mountain 759.6959
## 9 2050 Pacific 698.9621
Chapter 1 - Introduction
Introduction:
Grammar of Graphics:
Layers:
Example code includes:
data(mtcars)
# Explore the mtcars data frame with str()
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# Execute the following command
ggplot(mtcars, aes(cyl, mpg)) +
geom_point()
# Change the command below so that cyl is treated as factor
ggplot(mtcars, aes(factor(cyl), mpg)) +
geom_point()
# Edit to add a color aesthetic mapped to disp
ggplot(mtcars, aes(wt, mpg, color=disp)) +
geom_point()
# Change the color aesthetic to a size aesthetic
ggplot(mtcars, aes(wt, mpg, size = disp)) +
geom_point()
data(diamonds)
str(diamonds)
## Classes 'tbl_df', 'tbl' and 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Add geom_point() with +
ggplot(diamonds, aes(carat, price)) +
geom_point()
# Add geom_smooth() with +
ggplot(diamonds, aes(carat, price)) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Make the points 40% opaque
ggplot(diamonds, aes(carat, price, color = clarity)) +
geom_point(alpha=0.4) +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Draw a ggplot
plt_price_vs_carat <- ggplot(
# Use the diamonds dataset
diamonds,
# For the aesthetics, map x to carat and y to price
aes(x=carat, y=price)
)
# Add a point layer to plt_price_vs_carat
plt_price_vs_carat +
geom_point()
# Edit this to make points 20% opaque: plt_price_vs_carat_transparent
plt_price_vs_carat_transparent <- plt_price_vs_carat +
geom_point(alpha=0.2)
# See the plot
plt_price_vs_carat_transparent
# Edit this to map color to clarity,
# Assign the updated plot to a new object
plt_price_vs_carat_by_clarity <- plt_price_vs_carat +
geom_point(aes(color=clarity))
# See the plot
plt_price_vs_carat_by_clarity
Chapter 2 - Aesthetics
Visible Aesthetics:
Using Attributes:
Modifying Aestehtics:
Aesthetics Best Practices:
Example code includes:
mtcars <- mtcars %>%
mutate(fcyl=factor(cyl), fam=factor(am, levels=c(0, 1), labels=c("automatic", "manual")))
str(mtcars)
## 'data.frame': 32 obs. of 13 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
## $ fcyl: Factor w/ 3 levels "4","6","8": 2 2 1 2 3 2 3 1 1 2 ...
## $ fam : Factor w/ 2 levels "automatic","manual": 2 2 2 1 1 1 1 1 1 1 ...
# Map x to mpg and y to fcyl
ggplot(mtcars, aes(x=mpg, y=fcyl)) +
geom_point()
# Swap mpg and fcyl
ggplot(mtcars, aes(x=fcyl, y=mpg)) +
geom_point()
# Map x to wt, y to mpg and color to fcyl
ggplot(mtcars, aes(x=wt, y=mpg, color=fcyl)) +
geom_point()
ggplot(mtcars, aes(wt, mpg, color = fcyl)) +
# Set the shape and size of the points
geom_point(shape=1, size=4)
# Map color to fam
ggplot(mtcars, aes(wt, mpg, fill = fcyl, color=fam)) +
geom_point(shape = 21, size = 4, alpha = 0.6)
# Base layer
plt_mpg_vs_wt <- ggplot(mtcars, aes(wt, mpg))
# Map fcyl to shape, not alpha
plt_mpg_vs_wt +
geom_point(aes(shape = fcyl))
# Base layer
plt_mpg_vs_wt <- ggplot(mtcars, aes(wt, mpg))
# Use text layer and map fcyl to label
plt_mpg_vs_wt +
geom_text(aes(label = fcyl))
# A hexadecimal color
my_blue <- "#4ABEFF"
# Change the color mapping to a fill mapping
ggplot(mtcars, aes(wt, mpg, fill = fcyl)) +
# Set point size and shape
geom_point(color=my_blue, size=10, shape=1)
ggplot(mtcars, aes(wt, mpg, color = fcyl)) +
# Add point layer with alpha 0.5
geom_point(alpha=0.5)
ggplot(mtcars, aes(wt, mpg, color = fcyl)) +
# Add text layer with label rownames(mtcars) and color red
geom_text(label=rownames(mtcars), color="red")
ggplot(mtcars, aes(wt, mpg, color = fcyl)) +
# Add points layer with shape 24 and color yellow
geom_point(shape=24, color="yellow")
# 5 aesthetics: add a mapping of size to hp / wt
ggplot(mtcars, aes(mpg, qsec, color = fcyl, shape = fam, size=hp/wt)) +
geom_point()
ggplot(mtcars, aes(fcyl, fill = fam)) +
geom_bar() +
# Set the axis labels
labs(x="Number of Cylinders", y="Count")
palette <- c(automatic = "#377EB8", manual = "#E41A1C")
ggplot(mtcars, aes(fcyl, fill = fam)) +
geom_bar() +
labs(x = "Number of Cylinders", y = "Count") +
# Set the fill color scale
scale_fill_manual("Transmission", values = palette)
palette <- c(automatic = "#377EB8", manual = "#E41A1C")
# Set the position
ggplot(mtcars, aes(fcyl, fill = fam)) +
geom_bar(position="dodge") +
labs(x = "Number of Cylinders", y = "Count") +
scale_fill_manual("Transmission", values = palette)
ggplot(mtcars, aes(mpg, 0)) +
geom_jitter() +
# Set the y-axis limits
ylim(c(-2, 2))
Chapter 3 - Geometries
Scatter Plots:
Histograms:
Bar Plots:
Line Plots:
Example code includes:
# Plot price vs. carat, colored by clarity
plt_price_vs_carat_by_clarity <- ggplot(diamonds, aes(carat, price, color = clarity))
# Set transparency to 0.5
plt_price_vs_carat_by_clarity +
geom_point(alpha = 0.5, shape = 16)
# Plot base
plt_mpg_vs_fcyl_by_fam <- ggplot(mtcars, aes(fcyl, mpg, color = fam))
# Default points are shown for comparison
plt_mpg_vs_fcyl_by_fam +
geom_point()
# Now jitter and dodge the point positions
plt_mpg_vs_fcyl_by_fam +
geom_point(position = position_jitterdodge(jitter.width=0.3, dodge.width=0.3))
data(iris)
ggplot(iris, aes(Sepal.Length, Sepal.Width, color = Species)) +
# Swap for jitter layer with width 0.1
geom_jitter(width=0.1, alpha=0.5)
ggplot(iris, aes(Sepal.Length, Sepal.Width, color = Species)) +
# Set the position to jitter
geom_point(position="jitter", alpha = 0.5)
ggplot(iris, aes(Sepal.Length, Sepal.Width, color = Species)) +
# Use a jitter position function with width 0.1
geom_point(position=position_jitter(width=0.1), alpha = 0.5)
data(Vocab, package="carData")
# Examine the structure of Vocab
str(Vocab)
## 'data.frame': 30351 obs. of 4 variables:
## $ year : num 1974 1974 1974 1974 1974 ...
## $ sex : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 2 2 2 1 1 ...
## $ education : num 14 16 10 10 12 16 17 10 12 11 ...
## $ vocabulary: num 9 9 9 5 8 8 9 5 3 5 ...
## - attr(*, "na.action")= 'omit' Named int 1 2 3 4 5 6 7 8 9 10 ...
## ..- attr(*, "names")= chr "19720001" "19720002" "19720003" "19720004" ...
# Plot vocabulary vs. education
ggplot(Vocab, aes(x=education, y=vocabulary)) +
# Add a point layer
geom_point()
ggplot(Vocab, aes(education, vocabulary)) +
# Set the shape to 1
geom_jitter(alpha = 0.2, shape=1)
datacamp_light_blue <- "#51A8C9"
ggplot(mtcars, aes(x=mpg, y=..density..)) +
# Set the fill color to datacamp_light_blue
geom_histogram(binwidth = 1, fill=datacamp_light_blue)
ggplot(mtcars, aes(mpg, fill = fam)) +
# Change the position to identity, with transparency 0.4
geom_histogram(binwidth = 1, position = "fill")
## Warning: Removed 16 rows containing missing values (geom_bar).
ggplot(mtcars, aes(mpg, fill = fam)) +
# Change the position to identity, with transparency 0.4
geom_histogram(binwidth = 1, position = "identity", alpha=0.4)
# Plot fcyl, filled by fam
ggplot(mtcars, aes(x=fcyl, fill=fam)) +
# Add a bar layer
geom_bar()
ggplot(mtcars, aes(x=fcyl, fill = fam)) +
# Set the position to "fill"
geom_bar(position="fill")
ggplot(mtcars, aes(fcyl, fill = fam)) +
# Change the position to "dodge"
geom_bar(position = "dodge")
ggplot(mtcars, aes(cyl, fill = fam)) +
# Change position to use the functional form, with width 0.2
geom_bar(position = position_dodge(width=0.2))
ggplot(mtcars, aes(cyl, fill = fam)) +
# Set the transparency to 0.6
geom_bar(position = position_dodge(width = 0.2), alpha=0.6)
# Plot education, filled by vocabulary
ggplot(Vocab, aes(x=education, fill = factor(vocabulary))) +
# Add a bar layer with position "fill"
geom_bar(position="fill")
# Plot education, filled by vocabulary
ggplot(Vocab, aes(education, fill = factor(vocabulary))) +
# Add a bar layer with position "fill"
geom_bar(position = "fill") +
# Add a brewer fill scale with default palette
scale_fill_brewer()
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Blues is 9
## Returning the palette you asked for with that many colors
data(economics)
# Print the head of economics
head(economics)
## # A tibble: 6 x 6
## date pce pop psavert uempmed unemploy
## <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1967-07-01 507. 198712 12.6 4.5 2944
## 2 1967-08-01 510. 198911 12.6 4.7 2945
## 3 1967-09-01 516. 199113 11.9 4.6 2958
## 4 1967-10-01 512. 199311 12.9 4.9 3143
## 5 1967-11-01 517. 199498 12.8 4.7 3066
## 6 1967-12-01 525. 199657 11.8 4.8 3018
# Using economics, plot unemploy vs. date
ggplot(economics, aes(x=date, y=unemploy)) +
# Make it a line plot
geom_line()
# Change the y-axis to the proportion of the population that is unemployed
ggplot(economics, aes(x=date, y=unemploy/pop)) +
geom_line()
load("./RInputFiles/fish.RData")
# Plot the Rainbow Salmon time series
ggplot(fish.species, aes(x = Year, y = Rainbow)) +
geom_line()
# Plot the Pink Salmon time series
ggplot(fish.species, aes(x = Year, y = Pink)) +
geom_line()
# Plot multiple time-series by grouping by species
ggplot(fish.tidy, aes(Year, Capture)) +
geom_line(aes(group = Species))
# Plot multiple time-series by coloring by species
ggplot(fish.tidy, aes(x = Year, y = Capture, color = Species)) +
geom_line()
Chapter 4 - Themes
Themes from Scratch:
Theme Flexibility:
Effective Explanatory Plots:
Example code includes:
recess <- data.frame(begin=as.Date(c('1969-12-01', '1973-11-01', '1980-01-01', '1981-07-01', '1990-07-01', '2001-03-01', '2007-12-01')),
end=as.Date(c('1970-11-01', '1975-03-01', '1980-07-01', '1982-11-01', '1991-03-01', '2001-11-01', '2009-07-30')),
event=c('Fiscal & Monetary\ntightening', '1973 Oil crisis', 'Double dip I', 'Double dip II', 'Oil price shock', 'Dot-com bubble', 'Sub-prime\nmortgage crisis'),
y=c(0.01416, 0.02067, 0.02951, 0.03419, 0.02767, 0.0216, 0.02521)
)
recess
## begin end event y
## 1 1969-12-01 1970-11-01 Fiscal & Monetary\ntightening 0.01416
## 2 1973-11-01 1975-03-01 1973 Oil crisis 0.02067
## 3 1980-01-01 1980-07-01 Double dip I 0.02951
## 4 1981-07-01 1982-11-01 Double dip II 0.03419
## 5 1990-07-01 1991-03-01 Oil price shock 0.02767
## 6 2001-03-01 2001-11-01 Dot-com bubble 0.02160
## 7 2007-12-01 2009-07-30 Sub-prime\nmortgage crisis 0.02521
events <- recess %>%
select(begin, y) %>%
rename(date=begin)
events
## date y
## 1 1969-12-01 0.01416
## 2 1973-11-01 0.02067
## 3 1980-01-01 0.02951
## 4 1981-07-01 0.03419
## 5 1990-07-01 0.02767
## 6 2001-03-01 0.02160
## 7 2007-12-01 0.02521
# Change the y-axis to the proportion of the population that is unemployed
plt_prop_unemployed_over_time <- ggplot(economics, aes(x=date, y=unemploy/pop)) +
geom_line(lwd=1.25) +
labs(title="The percentage of unemployed Americans\nincreases sharply during recessions") +
geom_rect(data=recess, aes(xmin=begin, xmax=end, ymin=0.01, ymax=0.055, fill="red"),
inherit.aes=FALSE, alpha=0.25
) +
geom_label(data=recess, aes(x=begin, y=y, label=event))
# View the default plot
plt_prop_unemployed_over_time
# Remove legend entirely
plt_prop_unemployed_over_time +
theme(legend.position="none")
# Position the legend at the bottom of the plot
plt_prop_unemployed_over_time +
theme(legend.position="bottom")
# Position the legend inside the plot at (0.6, 0.1)
plt_prop_unemployed_over_time +
theme(legend.position=c(0.6, 0.1))
plt_prop_unemployed_over_time +
theme(
# For all rectangles, set the fill color to grey92
rect = element_rect(fill = "grey92"),
# For the legend key, turn off the outline
legend.key = element_rect(color=NA)
)
plt_prop_unemployed_over_time +
theme(
rect = element_rect(fill = "grey92"),
legend.key = element_rect(color = NA),
# Turn off axis ticks
axis.ticks = element_blank(),
# Turn off the panel grid
panel.grid = element_blank()
)
plt_prop_unemployed_over_time +
theme(
rect = element_rect(fill = "grey92"),
legend.key = element_rect(color = NA),
axis.ticks = element_blank(),
panel.grid = element_blank(),
# Add major y-axis panel grid lines back
panel.grid.major.y = element_line(
# Set the color to white
color="white",
# Set the size to 0.5
size=0.5,
# Set the line type to dotted
linetype="dotted"
),
# Set the axis text color to grey25
axis.text = element_text(color="grey25"),
# Set the plot title font face to italic and font size to 16
plot.title = element_text(size=16, face="italic")
)
plt_mpg_vs_wt_by_cyl <- ggplot(mtcars, aes(x=wt, y=mpg, color=fcyl)) +
geom_point() +
labs(x="Weight (1000s of lbs)", y="Miles per Gallon")
# View the original plot
plt_mpg_vs_wt_by_cyl
plt_mpg_vs_wt_by_cyl +
theme(
# Set the axis tick length to 2 lines
axis.ticks.length=unit(2, "lines")
)
plt_mpg_vs_wt_by_cyl +
theme(
# Set the legend key size to 3 centimeters
legend.key.size = unit(3, "cm")
)
plt_mpg_vs_wt_by_cyl +
theme(
# Set the legend margin to (20, 30, 40, 50) points
legend.margin = margin(20, 30, 40, 50, "pt")
)
plt_mpg_vs_wt_by_cyl +
theme(
# Set the plot margin to (10, 30, 50, 70) millimeters
plot.margin=margin(10, 30, 50, 70, "mm")
)
# Whitespace means all the non-visible margins and spacing in the plot.
# To set a single whitespace value, use unit(x, unit), where x is the amount and unit is the unit of measure.
# Borders require you to set 4 positions, so use margin(top, right, bottom, left, unit)
# To remember the margin order, think TRouBLe
# The default unit is "pt" (points), which scales well with text
# Other options include "cm", "in" (inches) and "lines" (of text)
# Add a black and white theme
plt_prop_unemployed_over_time +
theme_bw()
# Add a classic theme
plt_prop_unemployed_over_time +
theme_classic()
# Add a void theme
plt_prop_unemployed_over_time +
theme_void()
# theme_gray() is the default.
# theme_bw() is useful when you use transparency.
# theme_classic() is more traditional.
# theme_void() removes everything but the data.
# Use the fivethirtyeight theme
plt_prop_unemployed_over_time +
ggthemes::theme_fivethirtyeight()
# Use Tufte's theme
plt_prop_unemployed_over_time +
ggthemes::theme_tufte()
# Use the Wall Street Journal theme
plt_prop_unemployed_over_time +
ggthemes::theme_wsj()
theme_recession <- theme(
rect = element_rect(fill = "grey92"),
legend.key = element_rect(color = NA),
axis.ticks = element_blank(),
panel.grid = element_blank(),
panel.grid.major.y = element_line(color = "white", size = 0.5, linetype = "dotted"),
axis.text = element_text(color = "grey25"),
plot.title = element_text(face = "italic", size = 16),
legend.position = c(0.6, 0.1)
)
theme_tufte_recession <- ggthemes::theme_tufte() + theme_recession
themeOld <- theme_get()
theme_set(themeOld)
# Set theme_tufte_recession as the default theme
theme_set(theme_tufte_recession)
plt_prop_unemployed_over_time +
# Add Tufte's theme
ggthemes::theme_tufte()
# Draw the plot (without explicitly adding a theme)
plt_prop_unemployed_over_time
plt_prop_unemployed_over_time +
ggthemes::theme_tufte() +
# Add individual theme elements
theme(
# Turn off the legend
legend.position = "none",
# Turn off the axis ticks
axis.ticks = element_blank()
)
plt_prop_unemployed_over_time +
ggthemes::theme_tufte() +
theme(
legend.position = "none",
axis.ticks = element_blank(),
axis.title = element_text(color = "grey60"),
axis.text = element_text(color = "grey60"),
# Set the panel gridlines major y values
panel.grid.major.y = element_line(
# Set the color to grey60
color="grey60",
# Set the size to 0.25
size=0.25,
# Set the linetype to dotted
linetype="dotted"
)
)
theme_set(themeOld)
data(gapminder, package="gapminder")
ctry <- c('Swaziland', 'Mozambique', 'Zambia', 'Sierra Leone', 'Lesotho', 'Angola', 'Zimbabwe', 'Afghanistan', 'Central African Republic', 'Liberia', 'Canada', 'France', 'Israel', 'Sweden', 'Spain', 'Australia', 'Switzerland', 'Iceland', 'Hong Kong, China', 'Japan')
gm2007 <- gapminder %>%
filter(year==2007, country %in% ctry) %>%
select(country, lifeExp, continent) %>%
arrange(lifeExp)
gm2007
## # A tibble: 20 x 3
## country lifeExp continent
## <fct> <dbl> <fct>
## 1 Swaziland 39.6 Africa
## 2 Mozambique 42.1 Africa
## 3 Zambia 42.4 Africa
## 4 Sierra Leone 42.6 Africa
## 5 Lesotho 42.6 Africa
## 6 Angola 42.7 Africa
## 7 Zimbabwe 43.5 Africa
## 8 Afghanistan 43.8 Asia
## 9 Central African Republic 44.7 Africa
## 10 Liberia 45.7 Africa
## 11 Canada 80.7 Americas
## 12 France 80.7 Europe
## 13 Israel 80.7 Asia
## 14 Sweden 80.9 Europe
## 15 Spain 80.9 Europe
## 16 Australia 81.2 Oceania
## 17 Switzerland 81.7 Europe
## 18 Iceland 81.8 Europe
## 19 Hong Kong, China 82.2 Asia
## 20 Japan 82.6 Asia
# Set the color scale
palette <- RColorBrewer::brewer.pal(5, "RdYlBu")[-(2:4)]
# Add a title and caption
plt_country_vs_lifeExp <- ggplot(gm2007, aes(x = lifeExp, y = fct_reorder(country, lifeExp), color = lifeExp)) +
geom_point(size = 4) +
geom_segment(aes(xend = 30, yend = country), size = 2) +
geom_text(aes(label = round(lifeExp,1)), color = "white", size = 1.5) +
scale_x_continuous("", expand = c(0,0), limits = c(30,90), position = "top") +
scale_color_gradientn(colors = palette) +
labs(title="Highest and lowest life expectancies, 2007", caption="Source: gapminder")
plt_country_vs_lifeExp
# Define the theme
plt_country_vs_lifeExp +
theme_classic() +
theme(axis.line.y = element_blank(), axis.ticks.y = element_blank(), axis.text = element_text(color="black"), axis.title = element_blank(), legend.position = "none")
global_mean <- gapminder %>% filter(year==2007) %>% pull(lifeExp) %>% mean()
x_start <- global_mean + 4
y_start <- 5.5
x_end <- global_mean
y_end <- 7.5
# Add text
plt_country_vs_lifeExp +
theme_classic() +
theme(axis.line.y = element_blank(), axis.ticks.y = element_blank(), axis.text = element_text(color="black"), axis.title = element_blank(), legend.position = "none") +
geom_vline(xintercept = global_mean, color = "grey40", linetype = 3) +
annotate("text", x = x_start, y = y_start, label = "The\nglobal\naverage", vjust = 1, size = 3, color = "grey40")
# Add a curve
plt_country_vs_lifeExp +
theme_classic() +
theme(axis.line.y = element_blank(), axis.ticks.y = element_blank(), axis.text = element_text(color="black"), axis.title = element_blank(), legend.position = "none") +
geom_vline(xintercept = global_mean, color = "grey40", linetype = 3) +
annotate("text", x = x_start, y = y_start, label = "The\nglobal\naverage", vjust = 1, size = 3, color = "grey40") +
annotate("curve", x = x_start, y = y_start, xend = x_end, yend = y_end, arrow = arrow(length = unit(0.2, "cm"), type = "closed"), color = "grey40")
theme_set(themeOld)
Chapter 1 - Data Pre-processing and Visualization
Data Normalization:
Handling Missing Data:
Detecting Anomalies in Data:
Example code includes:
# fifa_sample <- read_csv("./RInputFiles/fifa_sample.xls")
# glimpse(fifa_sample)
apps <- read_csv("./RInputFiles/googleplaystore.xls")
## Parsed with column specification:
## cols(
## App = col_character(),
## Category = col_character(),
## Rating = col_double(),
## Reviews = col_double(),
## Size = col_character(),
## Installs = col_character(),
## Type = col_character(),
## Price = col_character(),
## `Content Rating` = col_character(),
## Genres = col_character(),
## `Last Updated` = col_character(),
## `Current Ver` = col_character(),
## `Android Ver` = col_character()
## )
## Warning: 2 parsing failures.
## row col expected actual file
## 10473 Reviews no trailing characters M './RInputFiles/googleplaystore.xls'
## 10473 NA 13 columns 12 columns './RInputFiles/googleplaystore.xls'
apps <- apps[-10473, ]
glimpse(apps)
## Observations: 10,840
## Variables: 13
## $ App <chr> "Photo Editor & Candy Camera & Grid & ScrapBook", ...
## $ Category <chr> "ART_AND_DESIGN", "ART_AND_DESIGN", "ART_AND_DESIG...
## $ Rating <dbl> 4.1, 3.9, 4.7, 4.5, 4.3, 4.4, 3.8, 4.1, 4.4, 4.7, ...
## $ Reviews <dbl> 159, 967, 87510, 215644, 967, 167, 178, 36815, 137...
## $ Size <chr> "19M", "14M", "8.7M", "25M", "2.8M", "5.6M", "19M"...
## $ Installs <chr> "10,000+", "500,000+", "5,000,000+", "50,000,000+"...
## $ Type <chr> "Free", "Free", "Free", "Free", "Free", "Free", "F...
## $ Price <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", ...
## $ `Content Rating` <chr> "Everyone", "Everyone", "Everyone", "Teen", "Every...
## $ Genres <chr> "Art & Design", "Art & Design;Pretend Play", "Art ...
## $ `Last Updated` <chr> "January 7, 2018", "January 15, 2018", "August 1, ...
## $ `Current Ver` <chr> "1.0.0", "2.0.0", "1.2.4", "Varies with device", "...
## $ `Android Ver` <chr> "4.0.3 and up", "4.0.3 and up", "4.0.3 and up", "4...
cars <- read_csv("./RInputFiles/car-fuel-consumption-1.xls")
## Parsed with column specification:
## cols(
## distance = col_number(),
## consume = col_number(),
## speed = col_double(),
## temp_inside = col_number(),
## temp_outside = col_double(),
## specials = col_character(),
## gas_type = col_character(),
## AC = col_double(),
## rain = col_double(),
## sun = col_double(),
## `refill liters` = col_number(),
## `refill gas` = col_character()
## )
glimpse(cars)
## Observations: 388
## Variables: 12
## $ distance <dbl> 28, 12, 112, 129, 185, 83, 78, 123, 49, 119, 124, 1...
## $ consume <dbl> 5, 42, 55, 39, 45, 64, 44, 5, 64, 53, 56, 46, 59, 5...
## $ speed <dbl> 26, 30, 38, 36, 46, 50, 43, 40, 26, 30, 42, 38, 59,...
## $ temp_inside <dbl> 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 2...
## $ temp_outside <dbl> 12, 13, 15, 14, 15, 10, 11, 6, 4, 9, 4, 0, 10, 12, ...
## $ specials <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ gas_type <chr> "E10", "E10", "E10", "E10", "E10", "E10", "E10", "E...
## $ AC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ rain <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ sun <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ `refill liters` <dbl> 45, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ `refill gas` <chr> "E10", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
fifa_sample <- tibble::tibble(SP=c(43, 70, 47, 22, 74, 45, 65, 71, 66, 62, 58, 55, 57, 15, 67, 66, 46, 65, 71, 80, 68, 62, 49, 70, 55, 17, 56, 48, 25, 62, 14, 55, 17, 43, 62, 63, 52, 62, 58, 81, 73, 59, 60, 66, 43, 59, 58, 79, 16, 64, 14, 12, 68, 78, 36, 52, 59, 67, 75, 80, 38, 73, 56, 80, 66, 68, 72, 41, 72, 51, 66, 37, 75, 19, 15, 34, 69, 86, 74, 57, 80, 51, 76, 63, 22, 76, 43, 22, 46, 39, 55, 81, 77, 62, 81, 19, 70, 74, 60, 59),
RA=c(190, 12, 353, 669, 2.5, 2.6, 406, 18.6, 5.1, 653, 900, 450, 3.9, 713, 1.9, 4.8, 140, 1.6, 1.3, 38.1, 1.6, 953, 891, 2.2, 357, 149, 3.4, 1.7, 7, 347, 105, 2.4, 1.9, 73, 4.8, 801, 3.8, 1.2, 9.5, 6.8, 2.5, 656, 1.5, 7, 631, 1.9, 125, 6.4, 2.6, 648, 1.3, 1.3, 6.7, 20.8, 3.6, 305, 1, 357, 7.5, 17.1, 140, 1.4, 3, 10.3, 795, 6.5, 2.6, 530, 2.7, 495, 12.8, 850, 1.2, 436, 639, 945, 619, 164, 10.2, 639, 5, 365, 1.2, 350, 63, 11.7, 8.7, 534, 2.5, 413, 225, 15.2, 1.6, 534, 14.7, 119, 6.9, 20, 1.5, 512)
)
# Glimpse at the dataset
glimpse(fifa_sample)
## Observations: 100
## Variables: 2
## $ SP <dbl> 43, 70, 47, 22, 74, 45, 65, 71, 66, 62, 58, 55, 57, 15, 67, 66, ...
## $ RA <dbl> 190.0, 12.0, 353.0, 669.0, 2.5, 2.6, 406.0, 18.6, 5.1, 653.0, 90...
# Compute the scale of every feature
(fifa_scales <- sapply(fifa_sample, range))
## SP RA
## [1,] 12 1
## [2,] 86 953
# Plot fifa_sample data
ggplot(fifa_sample, aes(x=SP, y=RA)) +
geom_point(colour="blue", size=5) +
labs(title = "Original data", x="Shot power", y="Release amount (millions EUR)") +
theme(plot.title = element_text(size=22), text = element_text(size=18)) +
scale_x_continuous(breaks = round(seq(0, max(fifa_sample$SP), by = 5),1))
# Apply max-min and standardization: fifa_normalized
fifa_normalized <- fifa_sample %>%
mutate(SP_MaxMin = (SP-min(SP))/(max(SP)-min(SP)), RA_MaxMin = (RA-min(RA))/(max(RA)-min(RA)),
SP_ZScore = (SP - mean(SP)) / sd(SP), RA_ZScore = (RA - mean(RA)) / sd(RA)
)
# Compute the scale of every feature: fifa_normalized_scales
(fifa_normalized_scales <- sapply(fifa_normalized, range))
## SP RA SP_MaxMin RA_MaxMin SP_ZScore RA_ZScore
## [1,] 12 1 0 0 -2.265794 -0.7142706
## [2,] 86 953 1 1 1.556152 2.6029590
# Boxplot of original and normalized distributions
boxplot(fifa_normalized[, c("SP", "RA")], main = 'Original')
boxplot(fifa_normalized[, c("SP_MaxMin", "RA_MaxMin")], main = 'Max-Min')
boxplot(fifa_normalized[, c("SP_ZScore", "RA_ZScore")], main = 'Z-Score')
bands <- tibble::tibble(Blade_pressure=c('20', '20', '30', '30', '30', '28', '30', '28', '60', '32', '30', '40', '30', '25', '20', '?', '?', '?', '?', '?', '30', '30', '25', '30', '25', '20', '30', '25', '30', '35', '28', '30', '22', '20', '35', '?', '30', '28', '31', '34', '32', '?', '30', '30', '24', '20', '35', '25', '25', '34', '16', '20', '28', '25', '30', '35', '46', '50', '25', '30'),
Roughness=c('0.75', '0.75', '?', '0.312', '0.75', '0.438', '0.75', '0.75', '0.75', '1.0', '0.75', '0.75', '1.0', '0.625', '1.0', '1.0', '?', '?', '0.75', '0.75', '0.812', '0.812', '0.812', '1.0', '1.0', '1.0', '1.0', '1.0', '0.75', '0.75', '0.75', '0.75', '0.625', '0.625', '0.75', '0.875', '0.625', '1.0', '1.0', '0.75', '1.0', '0.875', '0.875', '0.812', '0.75', '0.75', '0.812', '0.625', '0.625', '0.5', '0.75', '0.75', '0.75', '0.875', '0.625', '?', '0.75', '0.75', '0.625', '0.875'),
Ink_pct=c('50.5', '54.9', '53.8', '55.6', '57.5', '53.8', '62.5', '62.5', '60.2', '45.5', '48.5', '52.6', '50.0', '59.5', '49.5', '62.5', '62.5', '58.8', '54.9', '56.2', '58.8', '62.5', '58.1', '62.5', '57.5', '57.5', '57.5', '58.8', '58.8', '58.8', '45.0', '43.5', '54.3', '53.2', '58.8', '63.0', '58.1', '58.8', '54.3', '62.5', '58.1', '61.7', '55.6', '55.6', '58.1', '56.2', '58.8', '57.5', '58.8', '61.0', '50.5', '50.5', '58.8', '58.8', '62.5', '55.6', '58.8', '62.5', '52.6', '54.9'),
Ink_temperature=c('17.0', '15.0', '16.0', '16.0', '17.0', '16.8', '16.5', '16.5', '12.0', '16.0', '16.0', '14.0', '15.0', '14.5', '16.0', '15.0', '14.0', '15.5', '16.4', '16.5', '16.0', '15.0', '16.3', '15.8', '14.5', '14.0', '15.0', '15.2', '15.0', '17.0', '16.0', '16.5', '14.1', '14.0', '17.0', '15.4', '15.0', '16.0', '15.0', '15.0', '16.0', '15.4', '16.0', '16.3', '15.8', '16.6', '17.0', '13.0', '14.0', '15.9', '17.0', '16.5', '15.0', '16.5', '18.0', '17.0', '12.0', '16.0', '14.6', '24.5'),
Band_type=c('band', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'band', 'noband', 'noband', 'band', 'band', 'band', 'noband', 'band', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'band', 'band', 'band', 'noband', 'noband', 'band', 'band', 'noband', 'noband', 'noband', 'noband', 'band', 'noband', 'band', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'band', 'band', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'band', 'band')
)
str(bands)
## Classes 'tbl_df', 'tbl' and 'data.frame': 60 obs. of 5 variables:
## $ Blade_pressure : chr "20" "20" "30" "30" ...
## $ Roughness : chr "0.75" "0.75" "?" "0.312" ...
## $ Ink_pct : chr "50.5" "54.9" "53.8" "55.6" ...
## $ Ink_temperature: chr "17.0" "15.0" "16.0" "16.0" ...
## $ Band_type : chr "band" "noband" "noband" "noband" ...
# Check for missing values using base R and naniar functions
any(is.na(bands))
## [1] FALSE
naniar::any_na(bands)
## [1] FALSE
# What? No missing values! Take a closer glimpse
glimpse(bands)
## Observations: 60
## Variables: 5
## $ Blade_pressure <chr> "20", "20", "30", "30", "30", "28", "30", "28", "60...
## $ Roughness <chr> "0.75", "0.75", "?", "0.312", "0.75", "0.438", "0.7...
## $ Ink_pct <chr> "50.5", "54.9", "53.8", "55.6", "57.5", "53.8", "62...
## $ Ink_temperature <chr> "17.0", "15.0", "16.0", "16.0", "17.0", "16.8", "16...
## $ Band_type <chr> "band", "noband", "noband", "noband", "noband", "no...
# Replace ? with NAs: bands
bands <- naniar::replace_with_na_all(bands, ~.x == '?')
# Compute missingness summaries
naniar::miss_var_summary(bands)
## # A tibble: 5 x 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 Blade_pressure 7 11.7
## 2 Roughness 4 6.67
## 3 Ink_pct 0 0
## 4 Ink_temperature 0 0
## 5 Band_type 0 0
# Visualize overall missingness
naniar::vis_miss(bands)
# Visualize overall missingness, clustered
naniar::vis_miss(bands, cluster = TRUE)
# Visualize missingness in each variable
naniar::gg_miss_var(bands)
# Missingness in variables, faceted by Band_type
naniar::gg_miss_var(bands, facet = Band_type)
## Warning: `cols` is now required.
## Please use `cols = c(data)`
# Visualize missingness in cases
naniar::gg_miss_case(bands)
# Impute with the mean
imp_mean <- bands %>%
naniar::bind_shadow(only_miss = TRUE) %>%
naniar::add_label_shadow() %>%
naniar::impute_mean_all()
## Warning in mean.default(x, na.rm = TRUE): argument is not numeric or logical:
## returning NA
## Warning in mean.default(x, na.rm = TRUE): argument is not numeric or logical:
## returning NA
## Warning in mean.default(x, na.rm = TRUE): argument is not numeric or logical:
## returning NA
## Warning in mean.default(x, na.rm = TRUE): argument is not numeric or logical:
## returning NA
## Warning in mean.default(x, na.rm = TRUE): argument is not numeric or logical:
## returning NA
## Warning in mean.default(x, na.rm = TRUE): argument is not numeric or logical:
## returning NA
# Impute with lm
imp_lm <- bands %>%
naniar::bind_shadow(only_miss = TRUE) %>%
naniar::add_label_shadow() %>%
simputation::impute_lm(Blade_pressure ~ Ink_temperature) %>%
simputation::impute_lm(Roughness ~ Ink_temperature) %>%
simputation::impute_lm(Ink_pct ~ Ink_temperature)
# Peek at the first few rows of imp_models_long
# head(imp_models_long)
# Visualize post-imputation distributions
# ggplot(imp_models_long, aes(x = imp_model, y = value)) +
# geom_violin(aes(fill=imp_model)) +
# facet_wrap(~variable, scales='free_y')
# Calculate post-imputation distribution stats
# imp_models_long %>%
# group_by(imp_model, variable) %>%
# summarize(var = var(value), avg = mean(value),
# median = median(value)) %>%
# arrange(variable)
# Peek at the cars dataset
head(cars)
## # A tibble: 6 x 12
## distance consume speed temp_inside temp_outside specials gas_type AC rain
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 28 5 26 215 12 <NA> E10 0 0
## 2 12 42 30 215 13 <NA> E10 0 0
## 3 112 55 38 215 15 <NA> E10 0 0
## 4 129 39 36 215 14 <NA> E10 0 0
## 5 185 45 46 215 15 <NA> E10 0 0
## 6 83 64 50 215 10 <NA> E10 0 0
## # ... with 3 more variables: sun <dbl>, `refill liters` <dbl>, `refill
## # gas` <chr>
# Boxplot of consume variable distribution
boxplot(cars$consume)
# Five-number summary: consume_quantiles
(consume_quantiles <- quantile(cars$consume))
## 0% 25% 50% 75% 100%
## 4 41 46 52 122
# Calculate upper threshold: upper_th
upper_th <- consume_quantiles["75%"] + 1.5 * (consume_quantiles["75%"] - consume_quantiles["25%"])
# Print the sorted vector of distinct potential outliers
sort(unique(cars$consume[cars$consume > upper_th]))
## [1] 69 71 74 79 81 87 99 108 115 122
# Scale data and create scatterplot: cars_scaled
cars_scaled <- cars %>%
select(distance, consume) %>%
scale() %>%
as.data.frame()
plot(distance ~ consume, data = cars_scaled, main = 'Fuel consumption vs. distance')
# Compute KNN score
cars_knn <- FNN::get.knn(data = cars_scaled, k = 7)
cars$knn_score <- rowMeans(cars_knn$nn.dist)
# Print top 5 KNN scores and data point indices: top5_knn
(top5_knn <- order(cars$knn_score, decreasing = TRUE)[1:5])
## [1] 320 107 56 62 190
print(cars$knn_score[top5_knn])
## [1] 4.322676 2.202246 1.927798 1.641515 1.365469
# Plot variables using KNN score as size of points
plot(distance ~ consume, cex = knn_score, data = cars, pch = 20)
# Scale cars data: cars_scaled
cars_scaled <- cars %>%
select(distance, consume, knn_score) %>%
scale() %>%
as.data.frame()
# Add lof_score column to cars
cars$lof_score <- dbscan::lof(cars_scaled, k = 7)
# Print top 5 LOF scores and data point indices: top5_lof
(top5_lof <- order(cars$lof_score, decreasing = TRUE)[1:5])
## [1] 165 287 186 320 80
print(cars$lof_score[top5_lof])
## [1] 4.172873 3.473775 3.352448 3.113629 3.058361
# Plot variables using LOF score as size of points
plot(distance ~ consume, cex = lof_score, data = cars, pch = 20)
Chapter 2 - Supervised Learning
Interpretable Models:
Regularization:
Bias and Variance:
Building Ensemble Models:
Example code includes:
car <- cars %>%
select(distance, consume, speed, temp_outside, gas_type, AC) %>%
mutate(gas_type=factor(gas_type), AC=factor(AC, labels=c("Off", "On")))
test_instance <- tibble::tibble(distance=12.4, consume=5.1, speed=45, temp_outside=5,
gas_type=factor("E10", levels=c("E10", "SP98")),
AC=factor("Off", levels=c("Off", "On"))
)
test_instance
## # A tibble: 1 x 6
## distance consume speed temp_outside gas_type AC
## <dbl> <dbl> <dbl> <dbl> <fct> <fct>
## 1 12.4 5.1 45 5 E10 Off
# Glimpse on the car dataset
glimpse(car)
## Observations: 388
## Variables: 6
## $ distance <dbl> 28, 12, 112, 129, 185, 83, 78, 123, 49, 119, 124, 118,...
## $ consume <dbl> 5, 42, 55, 39, 45, 64, 44, 5, 64, 53, 56, 46, 59, 51, ...
## $ speed <dbl> 26, 30, 38, 36, 46, 50, 43, 40, 26, 30, 42, 38, 59, 58...
## $ temp_outside <dbl> 12, 13, 15, 14, 15, 10, 11, 6, 4, 9, 4, 0, 10, 12, 11,...
## $ gas_type <fct> E10, E10, E10, E10, E10, E10, E10, E10, E10, E10, E10,...
## $ AC <fct> Off, Off, Off, Off, Off, Off, Off, Off, Off, Off, Off,...
# Build a multivariate regression model: car_lr
car_lr <- lm(consume ~ ., data = car)
# Summarize the model and display its coefficients
summary(car_lr)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56.883105605 3.484721565 16.32357264 8.066425e-46
## distance 0.007590493 0.004953192 1.53244479 1.262404e-01
## speed -0.193808735 0.077206153 -2.51027578 1.247618e-02
## temp_outside -0.587521978 0.128868107 -4.55909529 6.927263e-06
## gas_typeSP98 0.443869591 1.818572662 0.24407581 8.073031e-01
## ACOn -0.098094250 3.344675402 -0.02932848 9.766179e-01
# Predict with linear regression model
predict(car_lr, test_instance)
## 1
## 45.31822
# Build a regression tree: car_dt
car_dt <- rpart::rpart(consume ~ ., data = car)
# Fancy tree plot
rattle::fancyRpartPlot(car_dt)
# Extract rules from the tree
rpart.plot::rpart.rules(car_dt)
## consume
## 32 when speed >= 22 & temp_outside >= 12 & distance < 29
## 36 when speed >= 25 & temp_outside < 12 & distance < 47
## 38 when speed >= 25 & temp_outside is 4 to 12 & distance >= 167
## 40 when speed >= 22 & temp_outside >= 12 & distance >= 29
## 43 when speed >= 25 & temp_outside < 12 & distance is 52 to 122
## 49 when speed >= 25 & temp_outside < 4 & distance >= 167
## 51 when speed >= 25 & temp_outside < 12 & distance is 122 to 167
## 61 when speed is 22 to 25 & temp_outside < 12
## 68 when speed >= 25 & temp_outside < 12 & distance is 47 to 52
## 75 when speed < 22
# Predict test instance with decision tree
predict(car_dt, test_instance)
## 1
## 36.41667
fifaRaw <- c(0.569, -1.555, -0.068, -0.705, -1.342, -0.28, -0.068, -1.98, -0.28, 0.357, -1.767, 0.357, -0.28, -0.918, -0.068, -0.068, 2.693, -0.918, 0.357, -0.068, -0.918, 0.144, -0.493, -1.767, -0.28, 1.419, 0.357, -0.28, 1.844, -0.493, -1.342, -1.342, -0.28, -1.13, 1.207, 2.269, -0.28, 1.419, 0.357, -0.068, -0.918, -0.918, 0.569, -0.28, 0.144, -0.28, -0.705, -0.28, 2.693, -1.342, 0.782, 0.357, -1.555, 0.994, 1.207, 0.994, 0.357, -0.918, -0.28, -0.493, -0.705, 0.144, -0.068, 1.207, -1.13, -0.918, -0.918, -0.068, -1.555, -0.068, 1.207, 0.357, -1.342, 0.569, -0.493, 1.631, -0.918, 0.994, 1.207, -0.068, 0.357, -0.705, 0.782, -0.068, 2.481, 1.631, -0.918, -0.918, -1.767, 1.631, 0.782, -0.28, 0.569, -0.705, 0.144, 2.481, -1.342, 0.782, 0.144, 0.144, -1.342, -0.493, -0.918, -1.555, 0.357, -1.555, -0.493, -0.493, -0.493, 0.569, -1.342, -0.28, 1.207, -1.555, -1.342, -0.068, 1.419, -1.555, 0.569, -0.493, 1.207, 0.144, -0.705, -0.493, 2.056, -1.342, 1.207, -1.13, 0.782, -0.068, -0.493, -1.13, -0.068, -0.493, -0.068, 0.994, -1.13, 0.357, -1.13, -0.068, -1.13, -0.705, -0.068, -0.28, -0.705, 0.782, 1.207, 1.631, 0.994, -1.13, -1.13, 0.994, 0.569, 0.994, 0.782, 0.144, 0.144, 0.144, -1.13, -0.068, -0.705, 0.144, 0.357, 0.782, 0.782, -0.705, -0.28, -0.705, 1.631, 2.056, 1.631, -0.068, -1.13, 0.144, -0.28, 1.207, -0.493, 1.419, -1.555, 0.782, -1.13, -1.13, -1.13, -0.068, 0.782, -0.918, 0.144, 0.569, -0.068, -0.493, 0.994, -0.918, -1.342, -0.493, -0.918, 0.569, -0.28, -0.705, 0.144, -1.342, 0.994, -1.555, 1.207, -0.28, 2.056, -1.13, -0.918, 1.631, 0.357, -0.705, -0.493, 2.056, -0.493, 1.844, 0.782, -0.068, 0.144, -0.705, -0.918, -0.28, 0.782, -1.555, -0.918, 2.056, 0.994, -0.493, -0.493, 0.144, -1.342, -1.342, 0.782, 0.994, 0.144, -0.28, 0.144, -0.493, -0.493, -0.918, 0.782, -1.342, 0.144, 0.144, -0.28, 1.419, -0.068, -0.068, 0.782, -1.767, -0.918, -0.068, -0.493, 0.357, -1.13, -0.28, 0.357, -1.13, -0.28, 0.994, -1.555, 1.207, -0.28, -0.705, -1.13, 0.144, -0.918, -0.705, 0.994, 0.357, 0.357, -0.705, -0.493, -0.493, 0.357, -0.28, -0.493, 0.569, -1.342, -0.068, 0.569, 2.269, -0.493, 1.419, 1.631, -1.13, 0.144, 2.056, -0.068, 0.994, 1.631, -0.068, 1.419, 2.481, 1.207, -0.28, 2.056, -1.555, -0.918, 0.782, 0.782, -0.493, 1.631, -0.28, -1.342, -0.918, -1.555, -0.068, 0.569, 1.207, -0.068, 0.569, -0.705, -0.28, -1.342, 1.631, 0.357, -0.068, -1.555, 0.357, 0.782, -0.705, -1.13, -0.493, -0.28, -0.918, 0.994, -0.493, -1.342, 0.357, 0.782, -0.918, -0.28, 1.419, 0.144, -1.13, -0.28, -0.28, 0.569, -1.342, -0.918, -1.342, 1.844, -0.068, -0.28, -0.068, 0.144, -0.28, -1.767, -0.28, -0.705, 0.144, -1.13, -1.13, -0.705, 0.569, -0.068, -1.342, 0.357, 0.144, 2.056, -0.705, 1.631, 0.782, -1.342, 1.419, -0.28, -0.28, 0.144, 1.419, 1.631, 0.569, -0.705, 2.056, -1.767, -0.918, -0.28, -1.555, -0.068, -0.068, 0.144, -1.555, -0.493, -0.068, -0.068, -0.705, -0.28, -1.13, -0.068, 1.419, 2.056, -0.493, -0.918, -0.705, -0.918, 1.419, -0.493, -0.28, 0.144, -0.493, -0.918, -1.342, -0.28, 0.357, -1.13, 1.631, 0.782, 0.357, 0.994, 0.782, -0.068, -0.918, -0.28, 1.844, -0.28, 0.782, 0.357, -1.13, -1.13, 0.569, 0.569, -0.068, 0.782, 0.357, 0.782, 0.144, 1.844, 1.207, 0.144, 0.357, 0.357, -1.13, -1.767, -0.068, 2.056, -1.342, 1.631, -0.068, 1.631, -0.068, 0.357, 1.419, 0.782, 0.569, -0.918, 0.569, -0.918, 1.419, -0.28, 0.569, 0.994, 0.357, 0.782, 0.357, 1.207, 0.782, -0.918, -0.493, -0.28, -1.98, 0.994, -1.342, -1.342, 0.357, 0.144, -0.493, -0.068, -1.342, -0.705, -0.918, 0.357, -1.555, 0.357, 1.419, 0.357, -0.068, 1.419, 1.631, 1.419, 1.419, 0.144, -0.493, 0.569, 1.844, 0.569, -1.13, -0.28, 1.631, 1.844, 1.207, -0.705, -1.555, -1.342, -0.705, -0.705, 2.056, 1.419, -0.918, -0.493, 0.994, -0.705, 0.782, -0.198, 1.373, -0.023, -0.198, -0.023, 2.246, -0.198, 0.152, -0.547, -0.547, 0.675, -0.721, -0.896, 0.501, 1.199, -1.943, -0.023, 2.595, -0.372, -2.467, -0.198, -1.245, -0.372, -0.023, -0.023)
fifaRaw <- c(fifaRaw, -1.419, -0.896, -0.372, -0.198, 1.548, -1.245, -0.198, -0.547, 1.024, 1.548, -0.721, -0.721, 0.152, -0.198, 0.501, 1.722, 0.85, -0.547, -0.198, 0.326, 1.024, 0.85, 0.326, -0.198, -0.023, 1.199, -0.721, 1.548, 0.501, -1.07, 0.152, 0.675, -0.198, -1.594, 0.501, -0.198, 0.85, -0.198, -0.198, -0.023, 1.024, -0.023, -1.07, 0.85, -0.547, -1.594, -1.07, 0.675, 3.293, 0.326, -0.023, 1.373, -1.245, -0.372, 1.373, 0.326, 0.501, -0.547, -0.896, 0.675, -0.721, -1.245, -0.547, 0.85, 0.326, 0.501, -0.372, -0.372, 0.675, -0.372, 0.152, 0.501, 0.675, -0.547, -0.896, 0.85, 0.85, 1.199, -0.547, -0.547, 0.501, -1.07, -0.023, 0.326, 0.326, 0.501, 0.501, 1.024, 0.152, 0.675, 1.024, -1.07, -0.198, -1.07, 0.501, -1.245, -0.198, 1.024, 1.548, -0.547, -1.07, 0.326, 1.722, 0.501, -0.721, -0.547, -0.198, -0.547, -0.896, -0.198, -1.245, -0.198, -1.594, -1.768, -0.023, 1.897, 0.152, 2.421, 1.199, -1.07, -0.023, -0.023, 0.85, -0.023, 0.675, -0.198, -0.896, -0.547, -0.023, 0.675, -0.372, -0.896, 0.326, 1.024, 0.85, 0.501, -0.198, -1.245, -0.547, -0.372, 0.85, -0.896, 2.071, -0.547, -0.198, -1.245, 0.326, 0.326, -0.721, -0.198, -1.245, 0.152, -0.896, 0.501, -1.07, -0.372, -2.292, 0.326, 0.501, -1.07, 1.199, 0.152, -1.07, -1.943, -1.245, -0.198, 1.199, -1.768, 0.326, -1.245, -0.721, 0.326, 0.326, 0.85, 1.548, -0.023, -1.245, -1.07, -0.023, -0.547, 0.326, 1.024, -0.198, 0.326, 1.199, 0.85, -0.023, 1.199, -1.419, 1.199, -0.896, -0.023, 0.501, 0.152, -2.118, -0.198, -0.023, -0.023, -1.419, 1.024, 0.85, 0.501, -1.07, 0.85, -0.198, -1.245, -0.721, -0.721, 1.897, 0.326, 0.152, 1.024, -0.198, 0.326, 0.501, -0.198, -1.07, 1.199, -1.245, 0.675, -0.721, -0.547, 0.152, -1.07, -1.245, 0.85, -0.023, 1.373, 1.024, -0.547, -0.372, 1.024, -0.547, -0.372, -2.118, -1.768, -1.07, 1.024, -2.816, 1.024, 0.501, -0.372, -0.372, 1.897, 0.152, -0.721, 2.944, 1.024, 1.199, -0.721, -1.594, -0.896, -0.372, 0.85, -1.245, -0.372, 0.501, -0.547, 2.071, 1.548, -0.547, 1.548, -0.023, 1.024, -0.721, -0.198, 0.675, 0.85, -1.245, -1.594, 0.85, -1.419, 0.675, 0.675, -0.721, -1.768, 3.293, 0.326, 0.326, 1.024, -0.547, 0.326, 2.246, -0.198, -0.547, 1.373, 0.152, 1.199, 1.024, 0.152, -1.419, -0.372, -0.023, -0.372, -0.372, -0.023, 1.373, 1.024, 0.326, 0.675, 2.246, -0.547, 1.548, -0.372, -0.547, -0.023, -0.198, -1.07, 0.326)
fifaRaw <- c(fifaRaw, 0.675, 0.152, -1.419, 1.373, -0.023, 0.152, -0.547, 1.373, -0.023, 0.501, -0.198, -0.023, 0.501, 0.152, 1.199, -1.07, 2.421, -1.07, 1.024, 0.326, -0.198, -1.245, -1.245, -0.198, -0.372, -2.467, 0.501, 0.152, -0.023, -0.198, 0.326, -0.896, 0.501, 1.897, -0.547, -0.721, 0.501, -0.372, 1.373, 0.675, 0.326, 0.152, -1.07, 0.675, -0.198, -0.198, 0.152, -0.721, -0.198, -0.372, -0.023, -0.896, 0.675, -0.198, 0.675, 1.373, -1.594, 1.373, 0.326, -1.768, 1.722, 1.024, 0.326, 1.548, 1.722, -1.245, -2.292, 0.675, -0.547, -1.07, -0.547, -0.721, -1.419, 0.85, 0.501, 0.501, 0.501, -1.943, 1.199, -0.896, 0.152, 2.071, 0.152, -0.372, -0.721, 2.071, -0.896, -1.245, -1.07, -0.198, -2.292, -1.245, -0.023, -0.547, 0.501, -0.547, 0.85, -0.372, -0.372, -0.547, -0.023, -0.547, 0.675, -0.023, 0.85, 0.326, -0.198, -1.07, 0.152, 0.326, -0.372, -1.419, -1.245, 0.501, -0.372, -1.07, -0.896, -0.198, -0.023, 1.199, 1.373, -0.896, -0.896, 0.675, 0.326, -1.594, -0.372, -0.721, -0.547, 1.722, -0.547, -1.245, 1.722, 0.501, 0.326, 1.548, 0.675, -1.768, -0.372, -1.245, -1.245, -0.721, -1.245, 3.468, 0.675, -0.023, 0.152, -1.768, -0.198, -0.198, -1.594, 0.675, 0.501, -1.245, 0.85, 1.548, -0.023, 1.897, 0.675, -1.419, 0.85, -1.419, 1.199, 1.199, -0.372, -1.594, 0.618, 0.095, 0.409, -1.841, 0.042, -1.789, -0.586, -0.167, 0.775, 0.566, -0.167, 0.775, 0.461, 0.147, 1.351, -1.056, 1.508, 0.827, 0.775, 0.513, -1.475, 0.461, -1.789, -0.115, 0.775, -1.894, 0.88, 0.88, 1.194, 0.461, -0.69, 0.356, 0.461, 0.513, -1.632, 0.618, 0.566, 0.566, -0.586, 0.513, -1.004, 0.461, -1.737, 1.141, 1.194, 0.984, 0.042, 0.409, -1.946, -1.318, 1.455, 0.618, -0.01, -1.841, -0.481, -1.841, 1.351, -1.423, -1.946, 0.723, 0.461, 0.618, 0.513, 0.775, 0.304, 0.147, 0.618, -1.004, 0.827, 0.775, -0.219, 0.618, 0.67, 2.031, 0.042, -2.051, 0.409, 1.351, 0.618, -0.376, -0.324, -1.109, 1.037, -1.475, -1.998, 0.513, -0.272, 0.252, -0.952, 0.095)
fifaRaw <- c(fifaRaw, -0.062, 0.199, 0.775, -0.01, 0.984, 0.409, -0.899, 1.194, 0.67, -0.324, -0.743, -0.115, 0.827, -1.318, 1.089, -1.423, 0.252, 0.566, -0.847, 1.665, -1.894, -1.318, 1.037, -1.998, -0.952, 1.298, 0.67, 0.147, 0.042, -0.272, 0.88, 1.246, -1.946, 1.141, -1.946, -1.318, -0.272, 1.298, 0.042, -1.58, 0.566, -1.109, 0.356, 0.513, 0.095, 0.147, 0.566, -1.527, -0.062, -1.266, 0.67, -0.167, 1.56, 0.67, 0.304, 0.984, 1.194, -1.946, 1.141, 0.199, -0.272, 0.932, 0.409, -0.167, 1.037, 0.827, 0.67, 0.984, -0.01, -1.37, 0.67, 0.566, 0.723, 0.67, -2.051, 0.67, -0.899, -1.841, -1.737, -1.946, 0.566, 0.618, 0.409, -1.737, 0.252, 0.409, 0.095, 0.566, -0.429, 0.252, 0.461, -2.051, 0.252, 0.67, 0.775, -0.586, 0.566, 0.199, -0.01, 0.304, -0.376, -0.167, -1.894, -1.841, -1.056, 1.037, 0.199, 0.513, 0.618, -2.051, -1.894, -0.115, 0.932, 0.984, -0.272, 0.042, -0.062, 1.351, 1.351, 1.612, 0.827, -0.795, 1.089, 0.723, 1.298, 0.566, 1.141, -1.056, 0.304, 0.095, 0.88, -0.69, 0.356, 0.775, 1.403, -0.743, 0.566, 0.67, 0.042, -1.894, 0.513, 0.618, -0.219, 0.461, -1.841, 0.775, 0.827, -2.051, 0.147, 0.409, -1.789, 0.252, 0.827, 0.199, 0.409, 0.356, -0.952, -1.109, -0.586, 0.618, 0.984, -0.115, -0.69, 0.513, -0.899, 0.461, -0.638, 0.932, -1.946, -0.167, 0.304, -0.899, -1.998, 0.461, -0.219, 0.618, 0.304, 1.141, 0.618, 0.984, -1.056, -0.115, 0.409, -2.051, 0.409, 0.304, -0.533, -0.219, -0.376, -1.998, 0.356, 0.513, -2.155, -0.533, 0.042, 1.194, 1.246, 0.932, 0.827, -1.789, 0.775, 0.618, 1.455, -0.219, -1.841, -1.998, 0.199, 0.513, 0.095, 0.461, -1.632, 1.508, 0.461, -1.998, 1.141, -1.37, 1.298, 1.978, 0.723, 0.984, 0.723, 0.042, -0.795, 1.508, -0.324, 0.409, -1.004, 0.984, 1.194, -0.481, 0.775, -0.481, -1.998, 0.932, 0.356, 1.455, -0.533, 0.095, -0.324, -1.423, -0.533, -1.737, 0.461, 0.147, -1.998, 0.775, 0.566, -1.109, -0.01, -1.946, 0.042, -0.952, 0.618, 1.351, -0.69, 0.304, -1.056, -0.167, 1.403, 0.513, 1.298, -0.638, 0.304, 1.298, -1.737, -0.69, 1.037, -1.161, -1.789, -0.899, 1.403, -0.272, 0.252, 0.356, 0.88, -0.272, 0.827, 0.147, 0.461, -0.795, 0.932, -1.841, -0.69, -1.946, 0.618, 0.199, 0.775, 0.67, 0.775, -0.481, -0.01, -1.004, 0.409, -0.272, 0.199, -0.062, -1.841, 1.717, -1.266, 0.566, 0.304, 0.042, 0.095, -2.208, 0.566, 0.199, 0.827, 0.618, 0.513, -0.272, -1.58, 0.67, 0.304, -0.115, 1.298, 0.618, 0.461, 1.298, -1.737, 0.513, 1.141, 0.67, 0.88, 0.356, -0.69, 0.147, -1.004, 0.827, 0.461, 1.926, 0.461, 0.67, 1.194, -0.69, -1.841, 0.775, 0.775, 0.88, 1.141, 0.042, -1.894, -0.743, -1.998, -0.376, 0.67, -0.743, 0.827, -1.004, 1.194, -0.01, 1.141, 1.037, 0.304, -0.167, -0.847, 0.513, 0.461, 1.194, -1.894, 0.304, 0.042, 0.984, -0.219, -1.946, -1.632, -1.266, 0.775, 0.147, 0.618, -0.638, -1.161, 0.88, 0.461)
fifaRaw <- c(fifaRaw, 0.147, 0.095, -0.272, 0.88, 0.566, -1.998, -0.429, 1.351, 0.304, 0.723, -1.213, -1.475, -1.789, -1.737, 0.88, 0.775, -1.109, 0.199, 0.513, 0.461, 0.618, -0.115, 0.88, 1.351, 0.199, -0.638, -0.69, 0.252, 0.409, 0.618, -2.051, -0.01, -0.481, 0.513, 1.612, 0.67, -1.318, 1.232, 0.475, 1.081, -1.242, -0.636, -1.646, -0.283, 0.576, 0.475, 0.02, -0.485, 1.182, 0.525, 0.273, 1.182, 0.879, 1.131, 1.485, 0.879, -0.182, -1.444, -0.03, -1.646, -1.091, -1.04, -1.444, 0.778, -0.485, 0.273, 0.323, -1.444, -0.586, -0.586, -0.535, -1.697, 1.081, 0.576, -1.242, -0.737, 0.424, -0.434, -0.283, -1.798, 1.03, 0.02, 1.434, 1.333, 0.374, -1.697, -0.081, 1.384, 0.424, 0.576, -1.646, 0.677, -1.495, -0.182, -0.788, -2, 1.131, 0.828, -0.182, 1.03, -0.384, 0.626, 0.828, 0.172, -1.192, 0.172, 1.182, 0.828, 1.03, -0.03, 1.586, 0.02, -1.646, 1.03, 0.576, -0.788, 1.535, -0.737, -1.04, -0.081, 0.929, -1.697, 1.131, -1.444, -1.242, -0.99, -0.081, -0.737, 0.879, -0.485, 0.576, 1.182, 1.535, -0.889, 1.182, -0.737, 1.081, 1.081, 0.778, -0.384, -1.343, -0.737, -1.04, -0.687, 0.626, -0.232, 0.626, -1.848, -0.434, 0.071, -1.848, -1.394, 0.727, 0.475, 0.525, 1.131, 1.283, 0.02, 0.929, -1.293, 0.929, -1.646, -1.04, -0.485, 0.879, -0.434, -1.293, 0.677, -0.939, -0.384, 0.576, 1.182, 0.071, 0.677, -1.697, -0.788, -1.394, 0.98, -0.081, 1.333, 1.081, 0.626, 0.02, 0.778, -1.697, 1.03, 0.374, 0.929)
fifaRaw <- c(fifaRaw, 0.929, -0.838, -0.636, 0.475, 0.98, 0.879, 0.424, 0.778, -1.394, 0.525, 1.434, -0.182, -1.04, -1.444, 0.929, 1.232, -1.495, -1.495, -1.646, 0.424, 1.232, -0.636, -1.747, -0.586, 0.576, 1.182, 0.677, 0.323, -0.131, 0.172, -1.899, 0.929, -0.485, 0.626, 1.232, 0.626, -0.535, -0.384, -0.232, -1.596, 0.98, -2, -1.343, -1.04, 0.475, 1.182, -0.737, 0.727, -2, -1.697, 0.071, 0.626, -0.687, 1.131, -0.485, 0.98, 0.727, 1.283, 1.232, 1.081, -0.485, 0.424, 0.273, 1.687, 0.121, 0.273, -1.192, -0.737, -1.192, 0.677, -0.788, 0.172, -0.283, 0.475, -1.343, 0.071, 0.98, -0.131, -1.798, -0.485, -0.03, 0.929, -0.03, -1.242, 0.677, 1.283, -1.848, -1.091, 0.576, -1.545, 0.374, 1.586, 0.626, 0.424, 0.778, -0.838, 0.727, -0.99, 0.475, 1.081, 0.374, -0.838, 0.323, -0.99, 0.525, -1.091, 0.879, -1.899, 0.374, 0.172, -1.141, -1.848, -0.636, -0.687, -0.636, 1.434, -0.384, 0.02, 0.727, 0.727, -0.384, 1.636, -1.747, 0.677, -0.636, 0.525, 0.677, 1.384, -1.596, -0.384, 1.131, -1.747, 1.434, 1.838, 1.182, -0.636, 0.576, 1.788, -1.848, 0.475, 0.525, 0.778, 0.677, -1.545, -1.899, 0.626, 0.323, 1.03, 0.778, -1.596, 1.788, -0.384, -1.899, 0.02, -1.242, 1.081, 1.687, -1.293, -0.586, 1.232, -0.636, -0.889, 1.081, 0.475, 0.626, 0.576, 0.172, 0.778, 0.677, -0.939, 1.131, -1.545, 0.525, -0.889, 1.485, -0.636, -0.131, 1.182, -1.141, 1.182, -1.394, 0.879, 1.03, -1.747, 0.02, -1.192, -1.192, 0.475, -1.899, -0.03, -0.535, 1.03, 0.98, -0.384, 1.384, -1.394, -0.737, 0.374, 0.071, 0.828, 0.172, 0.222, 0.98, -1.394, -0.182, -0.081, -0.687, -1.646, 0.576, 0.727, 0.121, -0.485, -0.687, 0.98, -0.03, 1.333, -0.636, 0.424, 1.131, 0.879, -1.646, 1.081, -1.697, 0.626, -1.141, 0.525, -0.434, 1.333, 0.323, 1.03, -0.939, -0.232, 0.677, 0.323, -0.636, -1.444, 1.131, -0.434, 1.586, -0.838, -1.141, -0.485, -1.242, 0.02, -0.03, 0.374, -0.838, 0.98, -1.141, -1.394, 1.485, -0.232, 0.222, 0.121, 0.424, 0.677, -0.081, -1.848, -0.99, 0.273, 0.525, 0.727, 0.828, -0.131, -0.939, 0.778, 0.727, 1.182, 0.727, 0.626, 0.677, 0.879, 1.232, -1.596, -0.131, 1.232, 0.626, 1.384, 0.172, -1.848, -0.03, -1.848, 0.222, 0.778, 1.182, 0.828, -1.04, 0.525, -0.283, -0.485, 0.071, -0.737, 0.727, -1.242, -0.283, 0.576, 1.081, -1.747, 0.071, 1.182, 0.071, 1.333, -1.495, -1.242, 0.98, -1.04, -0.283, 1.434, -0.737, 0.626, 0.879, 0.172, 1.434, -0.636, -0.434, 1.182, 0.475, -1.495, 1.03, 0.576, 0.222, 0.475, -0.788, -1.242, -1.242, -1.545, 0.778, 1.081, -1.242, 1.384, -0.939, 0.626, 0.828, -0.434, 1.687, 1.384, 0.222, -0.283, -1.747, -0.182, 1.535, 0.778, -2.05, 0.677, -0.636, 0.273, 1.535, 0.071, -1.394, 0.467, 0.189, 0.801, -1.867, -0.033, -2.2, 0.69, -0.366, 0.301, -0.366, -0.644, 0.189, -0.144, -0.255, -0.533, 0.301, 0.467, 0.245, 0.856, -0.7, -1.867, -0.311, -1.756, -0.589, -0.033, -2.089, 0.023, 0.467, 0.634, -0.2, 0.078, -0.311, 0.356, -0.311, -1.811, 0.578, -1.033)
fifaRaw <- c(fifaRaw, 0.578, 1.19, 0.356, 1.023, 0.078, -2.033, -0.811, 0.189, -0.589, 1.301, 0.189, -2.256, -1.033, -0.144, -0.533, -0.644, -2.145, 1.301, -1.589, 0.301, 0.69, -1.2, 0.245, -0.644, 1.19, 1.023, 0.134, -0.144, -0.088, -0.866, 0.301, -0.255, 0.467, 0.467, 0.634, -0.644, 0.134, 0.578, -2.089, -0.255, -0.477, 0.856, 1.523, 1.412, 0.634, -0.366, 0.69, -2.2, 0.301, -0.2, -0.033, 0.801, 1.245, 1.19, 0.801, -0.033, -0.533, -0.033, 1.856, 0.356, -0.644, 0.412, 0.467, 0.801, -0.477, 0.578, -0.033, 0.134, 0.523, 0.023, -0.533, 0.856, 1.023, -2.2, 0.967, 0.023, -2.2, 0.301, 0.189, 0.912, -0.255, 0.245, 0.467, 0.523, 0.023, -1.756, -0.088, -2.256, -0.477, 1.301, 0.412, 1.134, -2.145, -0.255, 0.189, 0.301, -1.644, 1.19, 0.523, -0.033, -2.2, 0.078, 1.078, 0.412, 0.523, -0.422, 0.856, -1.089, 0.912, 0.356, -2.145, 1.245, -0.589, 0.745, 0.467, 0.745, 1.078, 0.245, 0.578, -0.2, 1.245, 0.634, 0.801, 0.856, 0.301, 0.69, 0.801, -1.756, -0.033, 1.412, -2.089, -2.089, -2.145, 0.023, -0.255, -0.255, -2.256, -0.755, -0.2, 0.245, 0.134, -0.255, 0.523, -0.644, -2.256, -0.7, 0.912, -0.311, 0.523, 0.356, 0.745, -0.366, 0.134, 1.023, 0.967, -2.145, -2.033, 0.301, -0.589, 0.801, 0.078)
fifaRaw <- c(fifaRaw, -0.922, -2.422, -2.033, -0.422, -0.589, -0.533, 1.356, -0.422, 0.634, 0.412, 0.856, -0.255, 0.301, 1.134, 0.856, 0.301, 0.912, -0.311, 0.023, 1.19, -0.088, -0.422, 0.578, 0.023, -0.866, 0.467, 1.412, 0.745, -0.978, 0.356, -0.755, -2.311, 0.356, 1.134, 0.69, 0.967, -1.756, -0.533, -0.811, -2.089, 1.245, 0.301, -1.867, -0.7, 0.745, 0.578, 1.023, -0.755, 1.134, -0.144, -0.477, 0.245, -0.422, 0.412, 0.856, 0.356, 0.856, 0.356, 1.356, 0.245, -1.756, -0.311, -0.811, 0.467, -2.2, -0.422, 0.412, 0.189, 0.356, 0.412, 0.356, 0.078, 0.69, 1.634, 1.69, -1.922, 0.523, -0.033, -0.644, 0.134, 1.19, -2.089, -0.088, 0.967, -2.2, 0.801, 1.301, 1.023, 0.69, 0.245, 1.523, -2.256, 0.412, 0.189, 0.245, 0.467, -1.922, -2.2, -0.033, 0.912, 0.245, -0.422, -1.922, 1.412, -0.477, -2.089, 0.301, 0.801, -0.589, 0.412, 0.023, 0.578, 0.245, 0.745, 0.189, 0.356, 0.967, -0.755, -0.033, 1.023, -0.644, -0.033, -0.144, 0.634, -2.256, 0.189, 1.579, -0.033, 0.078, 1.19, 1.023, 0.078, 0.856, -1.978, -0.589, -0.144, -2.2, 0.578, 0.356, 0.967, -0.033, -2.2, 1.134, 1.134, 0.912, 0.134, 0.523, -0.311, 0.189, 0.578, 0.412, -0.144, 1.245, -0.255, 0.078, -0.422, -2.256, -0.811, -0.144, 0.912, -2.256, 0.356, 0.634, 1.245, -0.144, 1.023, 0.023, 0.023, 0.801, 1.579, 0.634, 0.912, 0.412, -1.756, 0.801, -1.867, -0.922, -0.533, 0.245, 0.856, -0.255, -0.644, -0.2, 0.023, 0.801, -0.422, 0.745, -0.033, -2.256, 1.023, 1.467, 0.745, -0.033, 1.023, 0.412, -1.2, 1.023, 1.245, 0.69, 0.69, 0.301, -0.589, -2.033, 0.134, 0.467, 0.856, -0.033, 0.634, 0.189, 0.189, -1.811, 0.301, 0.356, 0.412, 0.356, 0.356, 0.912, 0.801, 0.801, 0.412, 0.245, 0.69, 0.801, 0.301, -0.866, 1.023, -1.922, 0.189, 0.745, 0.523, 0.356, -0.311, -2.256, 0.912, -2.311, -0.477, 0.301, 0.745, 0.967, 1.078, 0.801, 1.301, 0.634, -0.2, 0.634, 0.801, 0.578, 0.189, -0.533, 0.245, -2.2, 0.578, 1.412, 0.023, 1.301, -1.978, -2.2, 0.523, -0.2, 0.467, 0.856, -0.589, -0.422, 0.023, 0.412, 1.19, 0.912, -0.422, 1.634, -0.7, -2.2, 0.578, 0.467, -0.755, 0.023, 0.634, -1.7, -1.756, -1.978, 2.19, -0.366, 0.912, 1.301, 0.467, -0.033, -0.7, -0.811, 1.467, 1.412, -0.2, 1.134, 0.578, -0.811, 1.078, 0.134, -2.534, 1.023, -0.477, -0.311, -0.144, 0.467, -0.533, 0.061, 0.26, 0.458, -2.386, 0.26, -2.121, -0.137, -0.203, 0.193, 1.318, 0.26, 0.723, 0.723, 0.392, 1.252, -1.063, 1.119, 0.921)
fifaRaw <- c(fifaRaw, 0.855, -1.063, -1.923, 0.656, -0.865, -1.261, 0.127, -2.055, 0.326, 0.524, 0.656, 1.252, -0.931, -1.658, 0.061, 0.656, -1.592, 0.193, -0.005, 0.127, -0.005, 1.053, 0.326, 0.789, -0.732, 0.458, 0.656, 0.656, 0.127, 0.855, -1.923, 0.193, 1.45, 0.392, 0.458, -1.658, 0.59, -1.923, 0.855, -0.071, -1.46, 0.524, 0.524, 0.789, 0.392, 0.723, 0.127, 0.855, -0.005, -1.394, 0.656, 0.458, -0.269, -0.203, 0.26, 2.243, 0.59, -1.724, 0.921, -0.203, 0.723, 0.59, 0.392, -0.666, 0.127, -0.005, -2.187, 0.127, -1.658, -0.666, -1.526, 0.458, 0.59, 0.26, 0.127, 0.921, 0.524, 0.789, -1.195, 0.921, -0.6, -0.402, -0.203, 0.789, 0.458, -1.261, 0.061, -0.402, -1.989, 0.392, 0.193, 1.516, -2.452, -0.931, 1.119, -2.253, -0.6, 0.723, 0.656, -0.534, -0.137, -0.005, 0.458, 0.789, -0.137, 0.855, -2.319, -1.658, -0.005, 1.384, 0.789, -1.46, -0.005, -0.6, -0.6, -0.137, 0.458, 0.193, 0.127, -2.386, 0.26, -0.203, 0.987, 0.59, 1.45, 0.987, 0.193, 0.326, 1.053, -1.327, 0.921, 0.458, -0.203, 0.193, 0.127, 0.392, 1.185, 0.26, 0.127, 1.053, 0.59, -0.203, 0.723, 0.326, 0.193, 0.193, -1.857, 0.59, -0.6, -1.724, -2.055, -1.195, 0.656, 0.326, -1.658, -1.526, -0.137, 0.458, 0.326, 0.524, 0.392, 0.326, 0.127, -2.452, 0.326, 0.392, 0.392, 0.127, 0.921, 0.061, 0.127, 0.392, -0.6, 0.789, -2.187, -2.716, -1.526, 0.987, -0.071, 0.127, 1.252, -0.931, -0.402, 0.127, 0.458, 0.392, 0.524, 0.061, 0.193, 1.252, 0.656, 1.252, 0.458, 0.061, 0.458, 0.127, 1.119, 0.26, -0.005, -0.071, 0.061, -0.005, 1.053, -0.6, 0.392, 0.656, 1.252, 0.127, 0.921, 0.193, 0.26, -2.386, 0.127, 0.392, -0.005, 1.318, -2.187, 0.458, 0.921, -2.253, 0.656, 0.193)
fifaRaw <- c(fifaRaw, -2.055, 0.524, 1.318, 0.326, 0.656, 0.789, -0.005, -0.732, -1.394, 0.061, 0.392, 0.656, 0.26, 0.855, -0.269, 0.326, -0.269, 0.921, -2.319, 0.326, -0.071, -0.402, -2.187, -0.336, 0.921, 0.326, 0.723, 0.59, 1.45, 0.326, -1.394, 0.656, 0.723, -1.526, 0.524, -0.666, -0.203, 0.458, 1.053, -1.658, -0.402, 0.656, -1.857, 0.392, 0.921, 0.789, 0.855, 0.789, 1.252, -2.319, 1.119, 1.053, 1.185, -1.129, -1.195, -2.518, -0.666, 1.185, 1.252, 0.326, -2.452, 1.516, -0.071, -2.452, 0.656, -0.6, 0.921, 1.714, 0.458, -0.137, 0.458, -0.336, 0.524, 1.318, 0.656, -0.137, -0.997, 0.59, 1.053, -0.402, -1.195, 0.326, -0.666, -0.005, 0.326, 1.185, -0.137, 0.855, -0.203, -1.658, 0.326, -1.989, -0.666, -0.336, -2.386, 0.127, -0.269, 0.524, 0.59, -2.187, 0.392, 0.524, 0.524, 0.987, 0.061, 0.458, -1.658, 0.855, 0.855, 0.458, 0.987, -0.137, 0.723, 1.252, -0.865, 0.061, 0.326, 0.127, -1.526, -0.997, 0.921, 0.656, 0.26, -0.137, 0.789, 0.26, 1.119, 1.185, 0.656, -1.195, 0.789, -1.46, -0.402, -2.782, 0.855, -0.534, 0.26, 1.053, 0.524, -0.732, 0.193, -0.269, 0.392, 0.326, 0.193, -0.005, -2.055, 0.987, -0.865, 0.326, -0.798, 0.855, -0.137, -1.989, 0.458, -0.203, 0.26, 0.789, 1.252, -0.997, -2.386, 0.921, 0.326, 0.326, 0.458, 0.723, 0.458, 1.384, -2.253, -0.005, 0.723, 0.061, 1.582, 0.326, 0.127, 0.723, -0.666, 0.524, 0.524, 1.318, 0.656, 0.193, 0.26, 0.458, -2.848, 0.127, 0.921, 0.326, 1.318, 0.392, -1.989, 0.789, -2.253, -0.865, 1.185, -0.732, 1.318, -0.137, 0.855, 0.458, 0.193, 0.061, 0.392, 1.252, -0.402, 0.061, 0.193, 0.987, -2.716, 0.326, -0.203, 0.59, 0.59, -1.592, -1.79, -1.394, -0.402, 0.524, 0.392, -0.269, -0.865, 0.458, 0.59, -0.402, 0.26, 0.127, 0.987, 0.524, -1.923, 0.127, 0.723, -0.137, 0.458, -0.666, -0.468, -2.716, -2.716, 1.318, 0.987, -0.071, 0.59, -0.071, 0.789, 0.061, 0.326, 0.723, 0.987, 0.26, -0.137, -1.195, -0.666, 0.723, 0.26, -2.584, 0.987, 0.061, 1.053, 1.384, -0.005, -1.46, 1.011, 0.622, 0.733, -1.488, -0.044, -1.599, -0.877, 0.456, -0.266, 0.345, -0.044, 0.345, 1.289, 0.456, 1.177, 0.011, 1.233, 1.622, 1.177, -0.933, -1.544, 0.289, -1.599, -0.711, -0.6, -1.544, 0.955, 1.066, 0.9, 0.511, -0.766, -0.6, -0.377, -0.211, -1.321, 1.233, 0.511, -1.544, -1.099, 0.789, -0.655, 0.233, -1.821, 0.011, 0.178, 1.066, 1.455, 0.289, -1.655, 0.011, 1.122, -0.322, 0.233, -1.655, 0.178, -1.266, 0.289, -1.099, -2.043, 0.122, 0.955, -0.488, 0.622, -0.044, 0.178, -0.6, 0.233, -0.877, 0.567, 1.344, 1.122, 0.844, 0.567, 2.233, 0.4, -1.377, 0.789, 0.955, 0.122, 1.4, -0.655, -1.21, 0.011, 0.233, -1.544, 1.233, -0.711, -0.766, -0.766, 0.233, 0.4, 0.067, -0.933, 0.9, 0.955, 2.288, -0.6, 1.066, -0.711, 1.455, 0.567, 0.067, -0.266, -0.877, -0.766, -0.655, -0.877, 0.122, -0.155, 1.622, -1.821, -1.155, -0.1, -1.821, -0.933, 0.955, 1.122, 0.733, 1.122, 0.733, -0.488, 0.456, -1.21, 0.567, -1.599, -1.21, -0.433, 0.955, 0.4, -1.21, 0.178, -0.711, -0.711, 0.345, 0.4, 0.233, 0.345, -1.655, -0.322, -0.766, 0.789, -0.155, 1.455, 0.345, 0.289, -0.988, 0.456, -1.655, 1.4, 0.678, 0.456, 1.344, -0.1, -1.599, 0.456, 0.511, 0.844, 1.011, -0.155, -0.711, 0.511, 1.177, -0.377, -0.6, -2.099, 0.511, 0.678, -1.266, -1.488, -1.599, 0.178, 1.066, -0.822, -1.766)
fifaRaw <- c(fifaRaw, -0.544, 0.844, 1.289, 1.011, 0.178, -0.1, 0.289, -1.821, 0.733, -1.044, 1.233, -0.044, 1.289, -0.655, -0.6, 0.011, -1.044, 1.289, -1.932, -1.488, -0.544, 0.233, 0.733, -0.766, -0.044, -1.877, -1.766, -0.044, 0.622, -1.044, 0.733, -0.655, 1.566, 0.122, 1.622, 1.4, 1.011, -0.488, -0.266, 0.289, 2.01, 0.678, 0.789, -0.377, -0.711, -0.544, 0.844, -0.488, -0.211, -0.711, 1.622, -1.044, -0.044, 0.844, 0.178, -1.821, -0.6, -0.711, 0.733, 0.678, -1.655, 1.066, 1.566, -1.988, -0.766, 0.456, -1.599, 0.122, 1.344, 0.622, 1.844, -0.155, -0.877, -0.322, -0.6, 1.122, 0.067, -0.433, -0.433, 1.177, -1.266, 0.289, -0.877, 1.455, -1.988, -0.155, -0.211, -0.711, -1.988, -0.211, -0.044, -0.6, 0.678, -0.044, 0.4, 0.178, 0.622, -0.655, 1.844, -1.766, 0.4)
fifaRaw <- c(fifaRaw, -0.988, 0.067, 0.844, 1.733, -1.544, -0.377, 1.177, -1.599, 0.955, 1.677, 1.788, -0.711, 1.233, 2.344, -1.766, 0.067, 0.955, 0.844, 0.233, -1.488, -1.988, 1.122, -0.544, -0.6, 0.4, -1.377, 1.899, -0.377, -1.821, 0.511, -0.711, 1.677, 1.955, -0.988, -0.433, 1.289, 0.011, -0.377, 1.788, 0.011, -0.266, -0.433, 0.289, 1.455, 0.067, -0.822, 0.678, -1.544, -0.544, -1.377, 2.233, -0.322, -0.266, 1.455, -0.766, 0.678, -1.488, 0.511, 0.9, -1.821, -1.599, -0.933, -0.488, 0.289, -1.932, -0.211, -0.488, 0.955, 0.733, -0.433, 1.177, -0.822, -0.822, -0.1, 0.122, 1.122, -0.711, 0.345, 1.289, -1.488, -0.155, -0.044, -0.766, -1.21, -0.211, 0.844, 0.067, -0.711, -0.766, 0.733, -0.488, 1.566, -0.377, 0.511, 1.566, 0.844, -1.377, 0.345, -1.71, 0.789, -1.21, 0.678, -0.1, 0.955, -0.266, 0.789, -1.099, -0.711, -0.155, 0.622, -0.766, -1.544, 1.899, -0.377, 1.233, -0.488, -0.377, -0.655, -1.21, -0.377, 0.345, 0.955, 0.622, 0.289, -0.988, -1.321, 1.788, -0.1, 0.456, 0.9, 1.4, 0.678, 0.511, -1.71, -0.766, 0.289, -1.044, 0.178, 0.9, -0.155, -0.044, -0.044, 0.511, 1.289, 1.4, 1.066, 0.622, 1.011, 1.289, -1.544, -0.433, 1.4, 0.233, 0.955, -0.155, -1.988, -0.544, -1.544, 0.678, 1.511, 0.511, 1.011, -0.877, 0.067, -0.433, 0.622, -0.266, -0.655, 0.955, -1.155, 0.4, 0.289, 1.011, -1.655, 0.122, 1.011, 0.233, 1.289, -1.544, -1.432, 0.289, 0.4, -0.155, 1.455, -0.6, -0.044, 0.9, 0.233, 1.566, -0.711, -0.488, 1.4, -0.044, -1.599, 0.289, 0.9, -0.322, 0.011, -0.655, -1.21, -1.377, -1.599, 1.344, 1.011, -0.766, 1.122, -1.155, 0.4, -0.211, -0.155, 1.788, 1.066, -0.044, -0.1, -0.655, -0.655, 1.733, 0.345, -1.766, 0.678, -0.544, -0.1, 1.677, 0.067, -0.766, 0.824, 0.302, 0.824, -1.837, 0.093, -2.15, -1.107, 0.458, 0.615, 0.719, -0.22, 0.928, 0.406, 0.615, 1.293, -0.585, 0.771, 1.397, 0.771, -0.324, -2.046, 0.458, -1.89, -0.272, -0.063, -2.255, 0.615, 0.406, 0.51, 0.824, -1.055)
fifaRaw <- c(fifaRaw, 0.197, 0.302, 0.615, -1.837, 0.25, 0.406, 0.354, -0.168, 0.876, 0.145, 0.563, -2.255, 0.458, 0.928, 1.189, 0.667, 0.615, -2.203, 0.145, 1.137, 0.563, 0.563, -2.15, 0.093, -1.785, 0.98, -1.159, -2.307, 0.667, 0.302, 0.563, 0.719, 0.928, 0.145, 0.876, 0.458, -1.107, 0.667, 0.667, 0.197, 0.406, 0.667, 1.397, 0.458, -2.203, 0.719, 0.406, -0.481, 0.667, -0.376, 0.197, 0.51, 0.093, -2.203, 0.667, -1.42, -0.063, -1.211, -0.585, 0.25, -0.116, 0.25, 0.51, 0.667, 0.406, -1.003, 1.084, 0.719, 0.667, 0.145, 0.771, 0.51, -1.368, 0.876, -1.42, 0.145, 0.771, -0.533, 0.51, -2.359, -0.846, 1.084, -2.411, -0.846, 0.667, 0.771, 0.406, 0.458, 0.771, 0.145, 0.824, -1.942, 0.719, -2.255, -1.472, -1.211, 1.189, 0.093, -1.89, 0.667, -0.585, -0.324, 0.563, 0.145, -0.116, 0.51, -2.359, -1.159, -0.168, 0.719, 0.302, 1.867, 0.876, 0.406, 0.771, 0.041, -2.046, 0.98, 0.563, 0.093, 0.615, 0.041, -0.22, 0.615, 0.824, 0.406, 0.667, 0.719, -0.063, 0.51, 0.51, 0.041, 0.406, -1.577, 1.084, 0.302, -1.785, -2.046, -2.098, 0.458, 1.084, 0.25, -2.046, 0.406, 0.406, 1.032, 0.667, 0.354, 0.093, 0.51, -2.359, 0.667, 0.406, 0.615, 0.354, 0.197, -0.846, 0.041, 0.145, -0.742, 0.458, -2.359, -1.942, -1.316, 0.615, 0.719, 0.563, 0.51, -1.994, -2.255, 0.145, 0.563, 0.458, 0.51, 0.25, 0.719, 0.197, 0.98, 1.397, 1.032, -0.116, 0.876, 0.041, 1.293, 0.563, 0.563, -1.003, 0.145, -0.324, 0.667, -0.324, 0.51, -0.168, 0.876, -0.481, 0.615, 0.458, 0.667, -2.255, -0.168, 0.041, 0.51, 1.032, -2.098, 0.667, 0.876, -2.516, 0.354, 0.667, -2.203, 0.145, 0.928, 0.302, 0.51, 0.458, -0.481, -0.324, 0.145, 0.615, 1.137, 0.458, -0.585, 0.51, -1.159, 0.458, -0.898, 0.928, -2.307, 0.51, 0.197, -1.159, -2.203, 0.197, -0.116, 0.25, 0.458, 1.032, 0.876, 0.197, 0.145, -0.376, 0.563, -2.307, 0.354, -0.585, 0.51, 0.824, 1.241, -1.89, -0.168, 0.719, -2.203, 0.771, 1.137, 0.667, 0.98, 0.771, 0.928, -2.463, 0.302, 0.25, 1.189, -0.063, -2.046, -2.307, -0.168, 0.563, 0.719, 0.615, -1.837, 1.45, 0.093, -2.255, 0.615, -0.481, 0.98, 1.658, 0.145, -0.272, 0.98, -0.116, -1.003, 1.397, 0.615, 0.667, -0.481, 0.667, 0.615, 0.145, 0.041, 1.241, -1.89, 0.719, -0.324, 1.345, 0.041, -0.063, 0.563, -1.524, 0.041, -2.046, 0.615, 0.615, -2.307, 0.406, 0.406, -0.742, 0.615, -2.307, -0.22, -0.637, 0.615)
fifaRaw <- c(fifaRaw, 1.032, -0.116, 0.667, -1.263, -0.22, 0.98, 0.51, 0.876, -0.011, 0.51, 1.084, -1.942, -0.168, 0.197, -1.159, -2.203, -0.637, 1.293, -0.168, -0.063, 0.25, 0.719, -0.063, 1.137, 0.406, 0.458, 0.406, 0.824, -2.046, 0.041, -2.098, 1.345, -0.272, 0.145, 0.406, 0.928, -0.063, 0.563, -1.055, 0.302, 0.406, 0.145, -0.011, -2.203, 0.824, -0.481, 0.824, -0.168, 0.25, -0.011, -1.263, 0.25, -0.063, 0.406, 0.458, 0.615, -0.637, -1.733, 1.189, 0.145, -0.324, 0.615, 0.615, 0.563, 1.137, -2.098, 0.145, 0.667, 0.302, 0.824, 0.824, -0.116, 0.771, -0.22, 0.51, 0.876, 1.137, 0.51, 0.667, 0.354, 0.302, -2.046, 0.458, 1.032, 0.771, 0.824, 0.093, -2.15, 0.041, -2.255, 0.51, 0.563, 0.51, 0.824, -0.846, 0.667, -0.429, 0.406, -0.116, -0.011, 0.563, -1.211, 0.302, 0.51, 0.98, -2.307, -0.533, 0.145, 0.719, 0.667, -2.098, -2.046, 0.25, -0.272, -0.063, 0.615, -0.324, -0.22, 0.719, 0.458, 0.98, -0.116, -0.116, 0.719, 0.458, -1.89, 0.041, 1.032, 0.563, 0.458, -0.794, -1.629, -2.203, -2.203, 0.458, 0.928, -0.429, 0.667, -0.324, 0.719, 0.458, 0.354, 0.824, 1.084, 0.563, -0.168, -0.742, 0.093, 1.241, 0.719, -2.15, 0.563, -0.116, 0.51, 1.345, 0.145, -0.898, 0.902, 0.902, 0.635, -1.657, -0.164, -1.87, -0.804, 0.742, 0.582, 0.795, 0.369, -0.271, 0.902, 0.582, 1.275, -0.378, 1.915, 1.595, 1.222, -0.964, -1.55, -0.538, -1.71, -0.751, 0.635, -1.604, 1.381, 0.582, 1.275, 0.902, -1.177, -0.431, -0.271, -0.591, -1.71, 1.008, -0.218, -0.964, -1.337, 1.062, -0.697, 0.209, -1.604, 0.209, 1.275, 1.275, 0.422, 0.529, -1.764, -0.324, 1.861, -0.111, 0.582, -1.657, 0.102, -1.177, 0.529, -1.124, -1.924, 1.168, 0.902, 0.422, 0.209, 0.529, 0.049, -0.431, 0.742, -0.697, 1.222, 0.742, 0.369, 0.369, 0.155, 2.128, 0.689, -1.444, 0.955, 1.328, -0.697, -0.058, -1.017, -1.231, 0.689, -1.337, -1.817, 0.848, -0.911, -0.697, -0.911, -1.124, -0.484, -0.111, 0.689, 1.062, 1.328, 0.635, -1.071, 0.155, -0.431, -0.164, -0.431, 0.635, 0.209, -1.177, -0.378, -1.071, -0.538, 1.062, -0.591, 1.915, -1.817, -1.284, 1.115, -1.71, -1.284, 1.008, 1.275, 0.582, 0.848, 0.049, 0.742, 1.541, -1.497, 1.168, -1.71, -1.231, -0.218, 1.488, 0.902, -1.39, 0.422, -0.751, -0.644, 0.315, 0.209, -0.058, 0.049, -1.337, -0.644, -1.337, 0.209, 0.102, 1.328, 0.209, 0.315, -0.964, 0.848, -1.87, 0.689, 0.422, 0.742, 0.689, -0.164, -0.484, 0.902, 0.955, 1.062, 0.902, 0.422, -0.697, 0.582, 0.582, 0.209, -1.071, -1.231, 0.848, -0.111, -1.39, -1.71, -1.87, 0.102, 0.369, -0.484, -1.764, -0.111, 0.475, 0.582, 0.795, -0.004, -0.324, 0.315, -1.764, 0.102, -0.271, 1.168, -0.378, 0.955, -0.004, -0.644, 0.582, -0.804, 0.475, -1.817, -1.817, -0.857, 1.115, -0.697, 0.049, 0.102, -1.977, -1.764, 0.262, 1.062, -0.004, 0.742, -0.911, 0.102, 1.861, 1.755, 1.648, 0.262, -0.484, 0.582, -0.111)
fifaRaw <- c(fifaRaw, 1.915, 0.315, 1.115, -1.124, -0.697, -0.164, 1.115, -0.857, 0.102, 1.168, 1.861, -0.857, 0.529, 0.635, 0.582, -1.87, 0.102, -1.284, -0.111, 0.635, -1.604, 0.689, 1.488, -1.924, -0.697, 0.369, -1.497, 0.315, 1.435, 0.635, 0.262, 0.475, -0.591, -0.484, -0.591, 0.635, 1.488, -0.218, -0.857, 0.848, -0.697, -0.058, -1.124, 0.795, -1.87, -0.484, 0.102, -0.804, -1.924, -0.324, -0.538, -0.644, -0.644, 0.475, 0.422, 1.062, -0.378, -0.538, 0.209, -1.817, 0.422, 0.102, 0.049, 1.541, 1.328, -1.657, 0.742, 1.062, -1.39, 0.155, 1.488, 0.475, 0.848, 1.222, 1.435, -1.764, 1.541, 0.155, 1.648, -0.964, -1.817, -1.924, -0.804, 0.315, -0.218, -0.058, -1.55, 2.128, 0.049, -1.817, 1.008, -0.644, 1.488, 2.341, -0.804, -0.964, 0.795, -0.591, -0.484, 1.595, 0.848, 0.102, -0.164, 1.381, 1.381, 0.102, -0.697, 0.155, -1.87, -0.324, -0.431, 1.701, -0.591, 1.381, 0.529, -0.804, 0.209, -1.444, 0.049, 0.475, -1.87, 0.529, -0.644, -0.751, 0.155, -1.657, -0.218, -0.751, 0.635, 1.008, -0.111, 1.168, -1.337, -0.058, 0.422, -0.164, 0.848, -0.697, 0.315, 1.328, -1.604, -0.271, 0.742, -0.857, -1.977, -0.591, 1.008, -0.271, -0.538, -1.017, 0.529, -0.538, 1.168, -1.284, 1.115, -0.431, 0.369, -1.817, -0.271, -1.817, 1.115, -0.911, 0.689, 0.262, 0.742, -0.004, 0.795, -0.857, 0.529, 0.422, 1.381, -0.804, -1.444, 2.074, -1.444, 0.262, -0.218, 0.209, 0.262, -0.804, -0.804, 0.529, 1.008)
fifaRaw <- c(fifaRaw, 0.049, 1.008, -0.538, -1.444, 1.222, -0.164, -0.378, 1.222, 1.328, 0.369, 1.275, -1.817, -0.644, 0.529, 0.529, 1.062, -0.004, -0.538, -0.058, -0.911, 0.529, -0.697, 1.915, 0.529, 0.689, 1.222, 0.902, -1.497, 0.848, 1.062, 0.369, 1.008, 0.635, -1.924, -0.644, -1.657, 0.369, 0.955, -0.697, 0.155, -1.231, 1.115, -0.218, -0.644, 0.422, -0.538, 0.475, -1.071, 0.315, 0.209, 1.595, -1.55, -0.484, 0.369, 1.222, 0.689, -1.604, 0.475, -0.697, 0.848, -0.004, 1.168, -0.697, -0.857, 1.275, 0.475, 0.955, -0.591, -0.644, 0.049, 0.582, -1.444, -0.271, 1.168, 0.635, 0.529, -1.124, -1.231, -0.857, -1.817, 1.488, 1.435, -0.857, 0.369, 0.529, 1.168, 0.955, -0.111, 0.369, 1.755, 0.955, 0.155, -0.751, -0.164, 0.475, 0.422, -1.71, 0.742, -0.484, 0.848, 1.541, 0.155, -1.231, 1.237, 0.747, 0.747, -1.432, -0.015, -1.649, -0.887, 0.148, 0.257, 0.91, -0.015, -0.07, 1.292, -0.124, 1.292, -0.342, 2.381, 0.856, 1.128, -0.941, -1.595, 0.856, -1.432, -0.669, 0.311, -1.704, 1.346, 0.039, 1.455, 1.401, -0.724, -0.397, -0.451, -0.724, -1.432, 0.747, -0.124, -0.941, -1.05, 0.747, -0.941, -0.07, -1.649, -0.615, 1.401, 1.237, -0.179, 1.183, -1.486, -0.669, 1.891, -0.342, 0.856, -1.54, -0.015, -1.159, 0.039, -0.996, -1.595, -0.669, 1.618, 0.529, 0.148, 0.475, 0.257, -0.015, 0.856, -0.669, 0.529, 0.148, -0.179, 0.093, -0.233, 2.272, 0.42, -1.704, 0.91, 1.891, -0.451, 0.747, -0.07, -0.615, 0.202, -1.05, -1.595, 1.019, -0.724, -0.615, -0.887, 0.693, -0.124, -0.397, -0.288, 0.91, 1.618, 0.42, -0.724, 0.148, -0.124, 0.202, -0.778, -0.233, 0.475, -1.159, -1.105, -0.669, -0.669, 0.42, -1.05, 2.163, -1.704, -1.214, 1.019, -1.649, -0.941, -0.506, 1.237, 1.074, 0.747, -0.56, 1.401, 1.51, -1.486, 1.401, -1.323, -0.832, -0.179, 0.856, -0.015, -1.649, -0.179, -0.506, -0.288, 0.039, 0.039, 0.257, -0.342, -1.105, -0.778, -0.941, 0.747, 0.366, 1.455, 1.401, 1.237, -0.506, 0.366, -1.649, 1.128, 0.747, -0.451, 0.638, -0.451, -0.179, 0.747, 0.802, -0.288, 1.292, 1.128, -0.56, 0.693, -0.179, 0.638, -0.615, -1.323, 1.237, -0.288, -1.486, -1.595, -1.54, 0.366, 0.693, -0.179, -1.54, -0.615, 1.074, 0.475, 0.91, 0.039, -0.179, 0.965, -1.704, 0.747, 0.093, 1.074, -0.397, 1.237, -0.615, -0.342, 0.366, -0.832, 0.856, -1.758, -1.323, -0.724, 1.237, -0.615, -0.124, -0.179, -1.813, -1.758, -0.179, 1.292, -0.288, 0.311, -0.451, -0.07, 1.673, 1.292, 1.455, 1.128, -1.05, 0.148, 0.202, 1.891, 0.148, -0.669, -0.832, -0.669, -0.07, 1.128)
fifaRaw <- c(fifaRaw, -0.56, 0.91, 1.727, 1.891, -1.377, 1.401, -0.506, 1.183, -1.649, -0.451, -0.941, -0.778, 0.856, -1.432, 0.529, 1.401, -1.758, -1.05, 0.42, -1.323, 0.747, 1.564, 1.074, 0.257, 0.093, -0.451, -0.724, -0.724, 0.148, -0.124, -0.56, -0.778, 0.91, -0.124, 0.148, -0.887, 1.51, -1.649, -0.451, -0.669, -0.342, -1.54, -0.342, -0.124, -0.615, -0.615, 0.42, 0.475, 0.475, -0.397, 0.093, 0.366, -1.758, -0.124, -0.996, -0.015, 1.074, 0.584, -1.704, 0.638, 1.128, -1.649, -0.233, 1.292, 0.638, -0.288, 1.836, 0.747, -1.486, 1.618, 0.747, 1.836, -1.105, -1.595, -1.758, -0.887, -0.07, -0.288, -0.179, -1.268, 2.435, -0.342, -1.704, -0.07, -0.887, 1.727, 2.272, -0.778, -0.233, 0.257, -0.07, -0.778, 1.618, 0.965, -0.179, 0.148, 1.618, 1.727, -0.451, -0.669, -0.506, -1.268, 0.148, 0.856, 1.51, -0.397, 0.475, 0.965, -1.105, -0.179, -1.105, 1.128, 0.093, -1.595, -0.724, -0.669, -1.214, 0.856, -1.758, 1.019, -0.778, 0.91, 0.856, -0.233, 0.584, -0.778, -0.397, -0.506, -0.179, 1.074, -0.506, 0.311, 0.91, -1.214, -0.288, 0.42, -0.615, -1.649, -0.724, 0.91, 0.311, -0.015, -0.451, 1.346, -0.288, 1.51, 0.366, 1.019, -0.669, 0.747, -1.867, -0.342, -1.704, 0.747, -0.724, 1.074, 0.257, 1.128, -0.397, 1.074, -0.778, 0.093, 0.802, 1.945, -0.288, -1.649, 2.109, -1.323, 2, -0.451, -0.724, -0.179, -1.758, -0.832, -0.342, 1.836, -0.56, 1.237, -0.778, -1.54, -0.179, -0.015, -0.56, 1.346, 0.965, 0.802, 1.51, -1.268, -0.778, -0.124, 0.747, 1.346, 0.475, -0.56, -0.288, -0.832, 0.202, -0.778, 2.272, 0.802, 1.564, 1.51, 1.128, -1.595, 0.693, 0.42, -0.124, 1.51, 0.91, -1.758, -0.07, -1.432, -0.015, 0.965, -0.724, 0.91, -1.105, 1.074, 0.257, -0.179, 0.093, -0.669, -0.015, -0.56, 0.093, -0.124, 1.891)
fifaRaw <- c(fifaRaw, -1.432, -0.397, 0.148, 1.564, 0.802, -1.649, 0.91, -0.778, -0.778, 0.093, 1.292, -0.56, -0.724, 1.346, -0.506, -0.887, -0.724, -0.342, -0.124, 0.693, -1.649, -0.288, 0.529, 1.074, 0.475, -0.832, -1.105, -1.214, -1.377, 1.618, 1.455, -0.887, 0.093, -0.996, 0.91, 1.401, -0.179, 0.039, 1.836, 0.802, -0.832, -0.778, -0.56, -0.07, -0.506, -1.758, 0.638, -0.233, 1.019, 1.346, -0.015, -0.832, -0.572, 0.269, 0.01, -2.124, 0.463, -1.801, -0.572, 0.075, -0.054, 1.045, 0.398, 0.528, 0.463, 0.463, 1.239, -1.089, 1.369, 0.786, 1.045, -0.96, -2.06, 0.786, -0.507, -1.025, 0.14, -1.348, 0.01, 0.722, 1.11, 1.239, -0.119, -1.348, -0.507, 0.722, -2.06, -0.119, -0.313, -0.184, 0.334, 1.11, 0.075, 1.304, -1.93, 0.722, 0.075, 0.075, -0.507, 0.851, -1.866, 0.398, 1.821, 0.334, 0.398, -1.995, 0.592, -1.93, 0.851, -0.119, -1.219, -0.119, 0.722, 0.98, 0.14, 0.528, 0.14, 0.592, -0.184, -1.477, 0.592, 0.592, -0.96, -0.895, 0.075, 2.598, 0.528, -1.283, 1.11, 0.204, 0.98, -0.637, 0.269, -0.443, 0.657, -1.283, -1.995, 0.075, -1.736, -0.507, -1.801, 0.722, -0.054, 0.528, 0.075, 1.045, 0.786, 0.657, -1.736, 0.851, 0.204, -0.96, -0.313, 1.304, 0.722, -1.866, 0.14, -0.507, -1.542, 0.592, -0.119, 1.692, -1.413, -0.637, 1.239, -2.124, -0.701, 0.592, 0.786, -0.766, -0.507, -1.219, 0.786, 0.851, -0.184, 1.11, -1.995, -1.801, -0.572, 1.433, 1.304, -1.154, 0.14, -0.119, 0.01, 0.14, -0.119, 0.334, -0.054, -2.642, 0.334, -0.054, 0.851, 0.851, 1.369, 1.304, -0.054, 0.463, 1.304, -1.089, 0.463, 0.463, -0.766, -0.119, 0.204, -0.184, 1.175, 0.398, 0.075, 1.498, 0.657, -0.184, 0.916, -0.249, 0.463, 0.98, -0.184, 0.204, -1.542, -0.766, -1.866, -1.154, 0.592, 0.528, -1.283, -1.283, 0.01, 0.657, -0.443, 0.786, -0.054, 0.528, -0.119, -1.866, -0.119, 0.204, 0.463, -1.219, 1.045, -0.249, 0.398, 0.592, 0.075, 1.045, -1.607, -2.189, -1.154, 1.239, -0.766, -0.313, 1.498, -0.96, 0.463, 0.722, 0.398, 0.722, -0.119, 0.14, 0.269, 1.433, 0.916, 1.433, 0.657, 0.592, -0.313, 0.592, 1.11, 0.204, -0.054, -0.119, -0.119, -0.249, 1.045, -0.054, 0.463, 0.463, 0.916, -0.054, 0.592, 0.398, 0.14, -1.93, 0.269, 0.269, -0.766, 1.239, -1.93, 0.463, 1.11, -1.995, 0.722, 0.269, -2.254, 0.786, 1.11, 1.11, 0.916, 0.851, -0.119, -1.348, -1.477, -0.443, -0.766, 1.11, -0.119, 0.786, 0.01, 0.075, -0.054, 0.722, -1.607, 0.398, 0.01, -0.637, -1.801, 0.01, 0.851, 0.14, 0.98, 0.786, 1.239, 0.786, -1.672, 0.334, -0.507, -1.801, 0.528, -0.119, -0.313, 0.722, 0.398, -1.219, -0.378, 0.851, -1.089, -1.089, 0.657, 0.657, 0.398, 0.98, -0.378, -1.672, 1.498)
fifaRaw <- c(fifaRaw, 1.175, 1.369, -1.219, -0.96, -2.254, -0.831, 1.369, 0.98, -0.119, -1.995, 1.692, -0.184, -2.383, 1.11, -1.154, 0.916, 1.757, 0.269, -0.054, -0.637, -0.766, 0.916, 1.11, 0.528, 0.204, -1.348, 0.657, 1.304, -0.507, -1.025, 0.463, -0.443, -0.766, 0.528, 1.045, 0.14, 1.821, -0.831, -1.283, -0.766, -1.801, 0.204, -0.184, -2.383, 0.398, -0.443, -1.154, 0.786, -1.995, 0.463, 0.98, 0.916, 0.851, 0.398, 0.722, -1.477, 1.045, 0.98, 0.398, 1.304, -0.184, 0.851, 1.433, -1.283, 0.204, 0.592, -0.054, -1.477, -1.025, 0.98, 0.657, -0.184, 0.14, 1.304, 0.14, 1.175, 1.045, 0.916, -1.283, 0.98, -1.93, -1.283, -2.512, 0.851, -0.507, 0.528, 0.98, -0.054, -0.701, -0.378, -0.313, 0.334, 0.528, 0.463, 0.075, -1.607, 1.175, -0.507, 1.045, 0.01, 0.916, 0.01, -2.124, 0.463, 0.398, 0.592, 0.01, 1.627, -0.766, -1.93, 0.334, 0.528, 0.592, 0.334, 0.916, 0.592, 1.369, -2.06, -0.054, 0.98, 0.334, 1.304, -0.378, -0.054, 0.98, -1.995, 0.398, -0.831, 1.627, 0.851, 0.269, 0.592, 0.075, -2.383, 0.204, 0.657, 0.528, 1.239, 0.592, -2.448, 0.851, -1.413, -0.895, 1.433, -1.154, 1.498, -0.443, 1.045, 0.657, 0.592, 0.657, 0.98, 1.11, -0.313, 0.075, -0.572, 0.592, -2.642, 0.075, 0.14, 0.269, 0.269, -1.542, -1.477, -1.348, 0.14, 0.528, 0.204, -0.119, -1.348, 0.334, 0.592, -0.96, 0.398, 0.14, 0.463, 0.722, -1.801, -0.96, 0.398, -0.184, 0.398, 0.14, -0.637, -2.706, -1.866, 1.563, 0.98, 0.204, -0.249, 0.334, 1.045, -0.313, 0.463, 0.334, 0.722, 0.528, -0.054, -0.831, -0.701, -0.443, -0.313, -1.607, 1.175, 0.14, 0.851, 1.433, -0.119, -1.542, 0.613, 0.213, 0.556, -2.13, -0.358, -2.073, -0.53, 0.556, 0.156, 0.842, -0.358, 0.842, 0.442, 0.385, 1.071, -0.072, 0.899, 1.471, 0.899, -0.53, -2.302, 0.556, -0.987, -0.873, 0.042, -2.245, 0.385, 0.613, 0.613, 0.842, -0.701, -0.53, 0.213, 0.671, -1.616, 0.213, 0.213, 0.156, -0.015, 0.956, 0.27, 0.556, -2.073, 0.499, 0.899, 0.956, 0.099, 0.671, -1.902, 0.213, 1.242, 0.556, 0.385, -2.187, 0.27, -2.302, 1.128, -0.816, -1.502, 0.556, 0.556, 0.671, 0.728, 0.671, 0.442, 0.842, 0.042, -0.93, 0.213, 0.671, 0.099, -0.015, 0.385, 1.871, 0.385, -2.473, 0.842, 0.442, 0.213, 0.956, 0.213, 0.328, 0.442, 0.213, -2.016, 0.613, -1.387, -0.13, -1.673, 0.442, 0.385, 0.442, 0.213, 0.671, 0.613, 1.128, -0.987, 0.956, -0.015, 0.385, 0.385, 0.728, 0.27, -1.159, 0.213, -1.502, -0.987, 0.556, -0.873, 0.956, -2.473, -0.473, 1.242, -2.302, -0.244, 0.499, 0.671, 0.042, 0.442, 0.499, 0.27, 0.613, -1.101, 0.671, -1.844, -1.444, -0.587, 1.071, 0.556, -2.187, 0.156, -0.473, -0.187, 0.27, 0.556, -0.015, 0.156, -2.187, -1.273, -0.701, 0.842, 0.556, 1.757, 0.899, 0.042, 0.613, 0.785, -2.302, 0.842, 0.728, -0.015, 0.671, -0.187, 0.27, 0.956, 0.499, 0.328, 0.842, 0.728, -0.015, 0.613, 0.556, 0.042, 0.328, -2.016, 0.842, 0.27)
fifaRaw <- c(fifaRaw, -1.73, -1.959, -1.844, 0.385, 0.499, -0.987, -1.559, 0.042, 0.385, 0.842, 0.499, 0.099, 0.156, 0.499, -2.416, 0.499, 0.442, 0.556, 0.27, 0.499, -1.044, -0.072, 0.042, -0.644, 0.728, -2.645, -2.359, -1.387, 0.613, 0.842, 0.213, 0.499, -2.073, -2.645, 0.156, 0.499, 0.042, 0.556, -0.015, 0.613, 0.899, 0.956, 1.357, 0.556, 0.042, 0.556, 0.156, 1.299, 0.213, 0.499, -0.301, 0.156, -0.187, 0.671, -0.187, 0.328, 0.613, 0.842, -0.072, 0.213, 0.27, 0.442, -2.245, 0.156, 0.385, 0.328, 1.185, -2.302, 0.442, 1.014, -2.302, 0.213, 0.499, -1.73, 0.613, 0.899, 0.27, 0.613, 0.328, -0.301, -0.415, -0.873, 0.328, 0.842, 0.613, -0.587, 0.556, -0.301, 0.099, -0.415, 0.785, -2.416, 0.27, -0.015, -0.987, -2.187, -0.072, 0.328, 0.385, 0.613, 0.956, 1.357, 0.328, 0.27, -0.072, 0.728, -1.844, 0.499, -0.587, 0.27, 0.785, 1.071, -1.844, -0.072, 0.899, -2.245, 0.899, 1.014, 0.842, 0.956, 0.785, 1.071, -2.588, 0.671, 0.785, 1.128, -0.072, -2.302, -2.416, -0.244, 0.785, 0.956, 0.156, -2.13, 1.528, 0.099, -2.702, 0.785, -0.301, 0.785, 1.871, 0.27, 0.042, 1.014, -0.015, -0.015, 1.299, 0.671, 0.499, -0.415, 0.385, 0.671, -0.072, 0.27, 0.785, -1.844, 0.613, 0.042, 1.357, -0.13, 0.613, 0.499, -1.559, 0.042, -1.844, 0.328, 0.328, -2.13, 0.213, 0.156, -0.53, 0.613, -2.588, -0.015, -0.415, 0.613, 0.728, 0.099, 0.728, -1.273, 0.442, 0.899, 0.499, 0.956, -0.13, 0.556, 1.128, -2.13, -0.13, 0.328, -0.244, -1.559, -0.987, 1.185, 0.328, 0.042, 0.213, 0.671, -0.13, 1.128, 0.499, 0.442, 0.728, 0.671, -1.959, 0.213, -2.359, 0.956, -0.587, 0.156, 0.613, 0.728, -0.415, 0.556, -0.758, 0.213, 0.156, 0.099, -0.015, -1.33, 0.842, -0.473, 0.728, -0.244, 0.671, -0.13, -1.33, 0.328, 0.27, 0.442, 0.728, 0.899, -0.93, -2.588, 1.185, 0.042, 0.156, 0.442, 0.613, 0.27, 1.014, -1.959, 0.156, 0.671, -0.415, 0.956, 0.442, 0.042, 0.728, -0.301, 0.499, 0.728, 1.185, 0.556, 0.385, 0.442, 0.613, -2.416, 0.156, 0.728, 0.842, 0.899, 0.099, -2.073, 0.099, -2.073, -0.301, 0.785, 0.385, 1.014, -0.13, 0.842, 0.156, -0.13, 0.099, 0.042, 0.671, -1.101, 0.156, 0.156, 1.014, -2.13, 0.213, -0.187, 0.671, 0.613, -2.187, -2.245, 0.099, 0.042, 0.042, 0.671, -0.244, -0.187, 0.442, 0.842, 0.842, -0.13, -0.644, 0.613, 0.556, -2.416, -0.13, 1.071, 0.499, 0.442, -0.873, -2.53, -2.416, -2.53, 1.528, 1.014, -0.072, 0.613, 0.099, 0.842, 0.099, 0.213, 0.613, 1.014, 0.27, -0.187, -0.873, -0.187, 1.071, 0.442, -2.13, 0.842, -0.072, 0.556, 1.471, 0.213, -1.73, 1.784, 0.282, 1, -2.003, -0.044, -1.481, -1.22, 0.152, 1.066, 0.543, 0.413, -0.044, 0.021, 0.478, 0.87, -0.044, -1.872, 1.523, 0.543, 0.739, -1.415, -0.436, 0.021, -0.044, 0.347, -1.611, 1.066, 0.413, -0.044, 0.739, -0.762, 0.282, 0.543, -0.24, -1.546, 0.478, 1, 0.347, -0.371, -0.175, 0.347, 0.282, -2.591, 0.347, 1.262, 1.653)
fifaRaw <- c(fifaRaw, 0.021, 0.086, -2.199, -0.11, 0.217, 1.719, 0.282, -0.893, -1.024, -1.481, 1.392, -0.11, -1.742, 0.87, 0.478, 0.086, 1.196, 0.152, 0.87, 0.217, 0.347, -0.175, 0.674, 0.805, 1.066, 0.543, 1.066, 0.021, 0.543, -1.481, 0.674, 0.805, -0.697, 0.217, -0.762, -0.24, 1.066, 0.413, -1.938, 0.739, -0.305, 0.674, -0.371, 0.282, -0.632, -0.958, 1.719, -0.697, 1.196, -2.134, 0.805, 1.719, 0.674, 1.392, 0.282, 0.152, 0.543, 0.282, 0.609, -0.762, -0.24, 0.805, 0.282, -1.481, -2.656, -0.632, 0.152, -2.656, -0.632, 1.327, 0.152, 0.543, 0.674, 0.739, -0.175, 0.021, -1.22, 0.739, -1.938, -0.11, 0.086, 0.87, -1.285, -0.175, 1.327, 0.021, 0.805, 1.066, -1.154, 0.282, 0.347, -1.22, 0.021, -0.11, 0.478, 0.413, 1.914, 0.543, 0.935, 0.282, 0.413, -1.415, 0.543, -0.371, -1.024, -0.501, 0.674, -1.677, -1.089, 1.066, 0.805, -1.154, 0.282, 0.152, 0.347, 0.805, 0.217, 0.413, -0.044, 1.327, -1.481, -1.024, -1.415, -1.742, 0.413, 1.98, 0.413, -2.134, 0.739, 0.152, 1.523, 0.739, 0.282, 0.282, 0.805, -3.113, 0.609, 0.347, 1.131, 1.327, 0.152, -0.044, 0.086, 0.217, -1.872, 0.87, -2.199, -1.22, 0.021, 0.086, 1, 0.478, -0.24, -1.807, -0.762, 0.282, 0.282, 0.805, -0.567, 0.282, 0.805, -1.611, 1.262, 1.523, 1.066, -1.611, 0.674, -0.828, 0.805, 0.543, 1.196, -1.089, 0.347, 0.152, 0.543, 0.152, 0.478, -1.481, 1.131, -1.154, 0.282, 1.327, 1.196, -2.068, 0.413, -0.567, 1.131, 0.217, -1.481, 1.327, 0.217, -2.721, 0.086, 0.152, -1.677, 1.066, 0.282, -2.068, -0.044, 0.674, -0.567, 0.413, 0.217, 0.609, 1.327, -0.371, -0.567, 0.739, -2.068, 0.217, 0.086, 0.282, -2.003, 0.543, 0.478, -1.546, -2.525, 0.086, 0.413, 0.609, 0.021, 1.262, -0.762, 1.588, -1.35, 0.347, 0.152, -0.632, 0.805, -1.22, 0.478, 1, 0.87, -2.656, 0.674, -0.893, -0.632, 0.935, 0.674, -0.893, 0.674, -0.044, -1.481, -2.591, -0.697, -0.958, 0.674, 1.262, -2.068, -2.591, 0.217, -0.828, 0.152, 0.282, -1.154, 0.87, 1.196, -2.46, -0.24, -0.893, 1, 0.739, 0.413, 0.282, 0.805, 0.021, 0.021, 1.066, 0.347, 0.413, -0.567, 0.674, 0.739, 0.87, 0.478, 1.784, -0.697, 0.086, -1.024, 0.935, 0.021, 0.478, -0.305, -0.501, -1.415, -1.807, 0.478, 0.282, -2.525, 1.066, 0.805, 0.935, 0.674, -2.003, -0.893, 0.282, -1.35, 1.523, -0.762, 1.066, -0.436, 0.805, 0.347, 0.935, 0.347, -0.11, 0.805, -0.305, -1.677, 0.021, 0.347, -0.24, -1.285, -0.371, 1.196, -0.24, 0.478, -0.371, 0.021, -0.11, -0.697, 0.021, 0.086, 0.021, 0.347, -1.807, -0.305, -2.134, 0.739, 0.674, -0.371, 0.739, 1.196, 1.262, 0.739, -0.632, 0.021, 0.674, -0.436, -0.11, -2.329, -1.024, -2.068, 1.653, -0.11, -0.24, 0.478, -1.938, 0.347, -0.697, 0.87, 0.674, 0.086, 0.543, -1.807, 1.719, 0.152, -1.154, 1.066, -0.371, 0.086, 1.327, -1.546, 0.609, 0.87, 0.152, 1, 1.131, -0.24, 0.935, -1.415, 1.131)
fifaRaw <- c(fifaRaw, 0.543, 0.609, 0.021, 0.935, 0.021, -0.632, -2.656, 0.478, 0.413, 0.609, 0.347, -0.305, -2.656, -0.893, -2.068, 0.478, -0.436, -0.501, 0.217, 0.543, 1.327, 0.217, 0.674, 0.217, 0.021, -0.501, 0.021, 0.282, 0.739, 0.87, -0.436, 0.152, -0.11, 0.935, -0.175, -0.958, -1.546, -0.24, 1.457, -0.11, 0.152, -0.501, 0.347, 0.674, 0.543, 0.217, 0.347, 0.086, 0.609, -0.24, -1.742, 0.021, 0.347, 0.935, 0.609, 0.347, -1.089, -1.024, -0.567, 0.805, 0.935, -0.567, 0.543, -1.154, -0.11, 1.653, 0.609, 0.217, 0.805, 0.413, 0.282, -0.632, 0.935, 1.653, 1.457, -1.938, -0.11, 0.086, 0.086, 0.674, 0.543, -1.154, 1.292, 0.086, 0.823, -2.057, -0.048, -1.789, -1.186, 0.019, 1.627, 0.421, 0.153, 0.287, -0.048, 0.555, 0.555, 0.019, -2.057, 0.823, 0.555, 0.823, -1.119, 0.22, -1.588, -0.45, 0.488, -2.325, 1.091, 0.488, 0.019, 0.622, -0.584, -0.249, 0.756, 0.086, -1.588, 0.287, 1.493, 0.354, 0.019, -0.249, 0.622, 0.153, -2.057, 0.756, 1.493, 1.56, 0.555, 0.086, -2.66, -0.316, -0.182, 1.895, 0.22, -1.454, -0.784, -1.32, 1.359, -0.784, -1.655, 0.756, 0.488, 0.287, 1.091, -0.249, 0.756, 0.287, 0.622, -0.584, 0.019, 0.756, 0.823, 0.019, 0.354, -0.115, 0.153, -0.985, 0.354, 0.756, -0.918, 0.354, -0.918, 0.019, 0.756, 0.354, -1.588, 0.354, 0.019, 0.421, -0.048, 0.086, -0.182, 0.22, 1.895, -0.383, 1.024, -2.124, 0.153, 1.627, 0.689, 1.56, 0.488, 0.287, 0.823, 0.555, 0.89, -0.784, 0.22, 0.89, 0.89, -0.249, -2.459, -0.048, -0.584, -2.861, -0.383, 1.426, 0.22, 0.689, 0.823, 0.89, -0.048, 0.153, -1.521, 0.957, -1.789, -0.115, 0.555, 0.153, -1.119, -0.249, 1.091, 0.354, 1.024, 1.158, -2.124, -0.182, 0.153, -1.655, -0.249, -1.119, 0.555, 0.488, 1.694, 0.622, 0.287, 1.024, 0.555, -1.387, 0.89, -0.249, -1.253, 0.421, 0.488, -1.588, -0.918, 0.957, 0.89, -0.985, 0.22, 0.555, 0.555, 1.292, 0.756, 0.287, -0.517, 1.158, -0.851, -0.985, -0.918, -1.387, -0.182, 1.962, 0.488, -1.454, 0.756, 0.287, 1.359, 0.823, 0.153, 0.086, -0.115, -2.794, 0.421, 0.823, 0.823, 1.091, -0.048, -0.182, 0.153, 0.354, -0.784, 0.555, -2.593, -1.253, -0.115, -0.45, 0.823, 0.622, 0.287, -3.129, -0.918, -0.115, 0.555, 0.756, -0.182, 0.019, 1.761, -2.325, 1.158, 1.292, 0.756, -1.655, 0.354, -1.32, 0.957, 0.823, 0.89, -0.851, 0.421, 0.22, 0.153, 0.019, 0.22, -1.789, 1.225, -1.387, 0.488, 1.493, 1.225, -2.258, 0.756, -0.45, 1.56, 0.22, -0.985, 0.957, 0.153, -2.459, 0.153, 0.22, -1.856, 0.823, 0.354, -0.651, 0.22, 0.488, -0.048, 0.421, -0.784, 0.957, 1.225, -0.249, -0.249, 0.689, -2.124, 0.823, 0.622, 0.153, -2.526, -0.115, 0.354, -1.119, -2.258, 0.421, 0.488, 0.555, 0.153, 1.426, -0.784, 1.024, -1.052, 0.421, 0.689, -1.387, 0.689, -0.718, 0.22, 0.622, 0.823, -2.526, 0.622, 0.019, -0.316, 1.158, 0.823, -0.918, 0.823, 0.22, -1.32, -2.459, -0.584, -0.985, 0.689, 1.359, -0.383, -2.727, 0.354, -0.048, 0.354, 0.086, -1.052, 0.823, 1.56, -3.263, 0.421, -0.182, 1.024, 0.555, 0.287, 0.354, 0.622, 0.153, -0.115, 0.22, 0.421, 0.488, -0.918, 1.091, -0.182, 1.158, -0.584, 1.694, -1.052, 0.22, -1.588, 0.756, 0.019, 0.689, -0.115, -0.517, -0.784, -1.789, -0.182, 0.488, -1.722, 1.024, 0.622, 1.091, 0.622, -2.258, -1.253, 0.622, -0.651, 1.426, -1.387, 0.823, -0.249, 0.555, 0.555, 0.689, 0.354, 0.019, 1.091, -0.115, -1.454, -0.249, 0.287, -0.115, -1.655, -0.584, 1.426, -0.45, 0.019, -0.048, 0.287, 0.354, -0.718, 0.153, -0.249, 0.086, 0.22, -0.784, -0.784, -2.392, 0.555, 0.421, -0.651, 0.756, 1.225, 1.426, 0.689, -1.32, 0.756, 0.086, -0.718, -0.784, -1.856, -1.99, -1.186, 1.761, 0.689, -0.316, 0.555, -2.191, 0.957, -0.048, 0.957, 0.89, -0.383, 0.354, -1.856, 0.957, 0.287, -1.454, 0.823, -0.851)
fifaRaw <- c(fifaRaw, 0.153, 0.89, -1.119, 0.488, 0.823, -0.182, 1.024, 0.488, -0.048, 1.024, -1.588, 1.426, 0.555, 0.622, -0.182, 0.89, -0.45, -0.249, -2.861, 0.488, 0.622, 1.024, -0.651, 0.488, -2.526, -0.182, -2.191, 0.354, -0.651, -0.115, 0.287, 0.421, 0.823, 0.488, 0.622, -0.115, -0.182, -0.651, -0.316, 0.354, 0.287, 1.024, -0.651, 0.421, -0.584, 0.689, -0.249, -0.182, -1.253, -0.383, 1.225, -0.115, 0.354, -0.115, 0.622, 1.024, 0.153, 0.421, 0.22, -0.316, 0.622, -0.517, -1.32, -0.048, 0.89, 1.091, 0.287, 0.22, -0.918, -1.186, -0.918, 0.756, 0.756, -0.115, 0.89, -0.584, -0.651, 1.627, 0.756, 0.019, 0.354, 0.22, -0.249, -0.584, 1.225, 1.627, 1.627, -1.99, -0.383, 0.22, -0.316, 0.89, 0.689, -0.918, 1.582, 0.508, 1.247, -2.515, -0.298, -1.709, -0.366, 1.045, 0.306, -0.097, -0.567, 0.441, 0.575, 0.172, 0.776, 0.373, 0.037, 1.65, 0.978, 0.709, -0.97, 0.105, -0.164, -0.164, 0.239, -1.373, 0.642, 0.441, 1.247, 0.911, -1.507, 0.306, 0.239, 0.373, -1.507, 0.172, 0.306, 0.306, -0.836, 0.105, 0.239, -0.164, -2.717, -0.701, 0.844, 1.717, -0.164, -0.231, -1.843, 0.844, 0.776, 1.314, 0.172, -0.97, -0.836, -2.179, 1.65, -0.231, -1.642, 0.844, 0.508, -0.097, 1.381, 0.105, 0.306, 0.037, 1.045, -1.104, 0.373, 0.441, 1.112, 0.709, 0.709, 0.508, 1.045, -3.254, 1.045, 0.642, 0.172, -0.903, -1.642, -0.366, 1.515, -0.164, -1.172, 0.642, -0.298, 0.642, -1.44, 0.037, 0.037, -0.231, 1.784, 0.239, 1.448, -1.104, -1.037, 2.053, 0.709, 1.582, 0.172, -0.164, 0.508, -0.903, 0.508, -2.112, 0.172, 1.045, 0.306, 0.441, -2.112, -0.433, 0.978, -2.045, -1.44, 0.844, 0.239, 0.844, 0.642, 0.306, 0.776, 0.441, -0.97, 1.247, -1.239, -1.037, -0.634, 1.515, -0.567, -1.507, 1.045, -0.97, 0.105, 1.314, -1.642, 0.239, 0.508, -0.164, -1.239, -0.567, 0.575, -2.246, 1.784, 0.575, 0.373, -0.03, 0.911, -0.97, 0.776, 0.575, -0.298, 0.844, 0.306, -0.433, -0.634, 1.247, 0.709, -0.366, 0.239, -0.164, -0.164, 0.373, 0.373, 0.105, -0.701, 0.978, -0.97, -1.507, -1.575, -2.112, -0.231, 1.784, -0.298, -0.366, 0.239, 0.172, -0.164, 0.776, -0.5, 0.642, 0.508, -2.246, 0.239, 0.037, 1.784, 0.776, 0.105, -0.164, 0.105, -0.097, -0.231, 0.373, -2.582, -1.978, -0.903, 0.709, 0.978, 0.508, -0.097, -1.373, -0.769, 0.105, 0.508, 0.239, 0.911, 0.105, 0.844, -1.172, 0.844, 1.381, 0.776, -1.642, 0.642, 0.239, 0.508, 0.844, 1.247, -0.903, 0.575, -0.298, 0.844, -0.03, 0.911, -2.112, 0.306, -1.642, 1.179, 1.448, 1.851, -2.717, -0.366, -1.44, 0.776, 0.373, -1.709, 0.575, 0.373, -2.582, -0.03, 0.575, -1.373, 1.112, -0.03, 0.844, -0.231, 0.373, -0.634, 0.373, -0.164, 0.306, 1.045, 0.441, 0.373, 0.709, -0.836, 0.037, -1.911, 0.373, -2.045, -0.231, 1.717, -1.776, -1.978, 0.575, 0.037, 0.642, 0.978, 1.045, 0.239, 0.373, -0.433, -0.433, -0.097, -1.843)
fifaRaw <- c(fifaRaw, 0.441, -0.433, 0.776, 1.247, 1.045, -2.381, 1.179, -0.366, -0.366, 0.844, 0.978, 0.373, 0.037, 0.441, -0.164, -2.851, 0.508, 0.105, 0.978, 0.441, -0.03, -2.314, -0.836, -0.298, -0.231, -0.836, -1.978, 0.911, 0.844, -1.575, 0.037, -0.097, 1.314, 1.112, 0.037, 0.239, 0.978, -0.97, -1.306, 1.515, 0.575, 0.642, 0.306, 1.112, -0.298, 0.373, 0.508, 1.381, -1.172, 0.441, -1.507, 1.515, 0.844, 0.306, -0.03, -1.172, -1.978, -2.314, -0.366, 0.172, -1.911, 0.037, 0.441, 0.911, 1.247, -1.642, -1.642, 0.373, -0.701, 0.978, 0.105, 0.978, -0.836, 0.776, 0.508, 0.642, 0.239, -0.567, 1.381, 0.642, -1.575, 0.306, 0.508, -0.5, -1.104, -0.5, 1.112, -0.097, 0.575, 0.239, 0.306, 0.239, -0.5, 0.172, 0.373, -0.231, 0.239, -1.172, -0.903, -1.978, 1.381, -0.298, 0.105, 0.239, 1.045, 1.448, 1.247, -0.5, 0.441, 0.642, -0.5, -0.298, -1.911, -0.836, -2.045, 1.247, -0.097, -0.5, 0.441, -2.448, -0.567, -0.567, 1.112, 0.441, 0.105, 0.306, -1.575, 1.851, -0.433, -0.298, 0.911, 0.373, 0.373, 0.978, 0.172, 0.172, 0.709, -0.03, 1.247, 2.053, -0.634, 0.373, -0.903, 0.441, 0.105, 0.709, 0.508, 1.314, 0.844, -0.097, -1.709, 0.575, 0.911, 0.776, 0.642, 0.373, -2.515, -0.298, -1.776, -0.231, 0.844, -0.231, -0.03, -1.239, 1.112, 0.172, 0.575, 0.575, -0.366, -0.836, -0.903, -0.164, 0.978, 0.978, 0.105, -1.037, 0.105, 1.112, 0.105, -0.298, -1.709, -0.366, 0.776, -0.231, 0.037, -0.567, -0.836, 0.642, 0.776, 0.239, -1.172, -0.164, 0.441, 0.306, -1.978, -0.5, 0.172, 0.776, 0.978, -1.44, -0.231, -0.903, -1.575, 1.045, 1.381, -1.239, 0.239, -0.567, 0.642, 0.844, -0.164, 0.441, 0.575, 0.105, -0.298, -0.567, 0.306, 1.784, 1.247, -1.642, 0.105, -0.164, 0.709, 1.179, -0.03, -1.776, 0.662, -0.582, 0.21, -0.921, -1.034, -0.017, -0.356, -1.599, -0.017, 0.21, -1.938, 0.662, -0.356, 0.323, 1.34, -1.034, 1.453, 2.018, 0.662, -0.921, -0.921, 0.662, 0.323, -1.599, -0.582, -0.243, -0.13, -0.243, 1.114, 1.453, -0.695, -0.243, -0.356, -2.164, 1.905, 1.114, -0.695, 1.34, 0.323, 1.227, 1.227, 0.097, 0.323, -0.017, 1.34, 0.775, 0.662, -0.017, 1.114, -1.034, 1.34, 0.436, -0.921, 1.114, 0.436, 1.001, 0.549, -0.243, -0.582, -0.017, -0.243, 1.453, 0.323, 1.001, -0.243, 0.21, -0.808, -0.808, -1.712, -0.13, -0.582, -0.243, -0.017, 3.035, 0.097, 0.888, -0.243, 0.662, 0.323, 0.662, 0.436, -1.26, 0.549, -0.356, 1.114, 0.436, -1.373, -1.034, -1.486, 1.453, 0.775, -0.582, -0.017, -0.469, -0.582, 1.679, -1.486, 0.323, -0.017, -0.13, -0.13, 0.097, 0.323, -2.164, -0.356, -2.051, -0.808, -1.486, 0.549, 0.436, -2.051, 0.21, 1.679, -2.051, -0.469, 1.453, 0.097, -3.068, -0.13, -0.017, 0.549, 0.888, 0.436, 0.662, 0.097, -2.39, 1.227, 1.679, 0.662, -1.938, -0.017, -1.034, -0.582, 0.21, -0.017, -0.469, -2.503, 0.888, -0.808, 0.323, 0.436, -0.469, 1.34, 0.775, -1.938)
fifaRaw <- c(fifaRaw, 0.775, 0.662, 1.453, 1.001, -0.808, -0.582, 0.21, -0.695, 0.097, 1.453, 0.888, 0.436, 0.662, -0.017, 0.21, 0.436, 0.549, -0.582, 0.21, 0.549, 0.21, -0.808, 0.323, 0.549, 0.775, -0.808, 1.114, -0.243, -0.582, -0.469, 0.097, -0.017, 0.21, -0.582, 0.097, -0.469, -2.277, -0.582, -0.017, -0.921, -0.808, 0.549, -0.469, -0.695, -1.034, 0.436, 1.114, -1.599, -1.034, -1.26, 0.549, 0.436, -0.017, 1.566, -1.147, -0.017, -1.147, 0.323, 0.097, 0.21, -0.921, 0.323, 1.34, 0.323, 0.549, -0.243, 0.436, 0.549, 0.097, 2.018, -0.469, 0.775, -0.017, -0.695, -1.034, 0.549, -1.599, -1.373, 1.227, 1.114, 0.097, -0.13, -0.13, -0.356, -1.825, 0.21, -0.017, -0.243, 1.114, 0.662, 0.323, 0.888, -1.147, 1.114, -1.034, 1.001, -0.13, 0.549, -0.017, 0.549, -1.034, 0.549, -0.695, -0.921, -0.695, 1.227, 0.662, -0.582, 0.097, 0.323, -0.921, 0.549, 0.323, -3.633, -0.356, -2.051, -1.373, -1.599, -0.582, -0.921, 0.21, 0.323, 0.21, 2.583, 0.21, -1.825, 1.566, 1.114, 0.775, -1.486, -1.373, -0.921, 0.21, 1.792, -1.034, -0.017, 1.792, 0.549, 1.114, 1.114, 0.097, 1.679, 1.453, 2.357, -0.695, 0.549, 1.453, 1.114, -1.938, 0.097, -1.147, -1.034, 0.775, 0.549, -0.469, -0.921, 1.905, -0.469, 0.097, -0.582, -0.13, 0.775, 1.453, -0.243, 0.436, 0.21, -0.469, 0.21, 1.566, -0.243, -2.051, -0.695, 0.775, 0.888, -0.469, -0.469, -0.017, 0.775, -0.582, 0.888, 2.131, -0.695, 1.453, 0.775, -1.373, -0.356, -0.017, -0.582, -1.938, 0.549, -0.695, 0.323, -0.017, 0.775, -2.503, -0.695, 0.888, -0.243, -0.921, -0.017, 0.21, -1.26, 0.097, 0.662, 0.21, 1.114, -1.147, 0.436, 0.662, 0.21, -1.486, -0.469, 0.436, 0.323, -1.712, 1.566, 0.662, -1.486, 0.775, 0.21, -0.469, 0.888, 1.566, 0.097, 0.323, -0.243, -0.017, -1.486, 0.323, -1.034, -1.26, -0.582, 1.566, 0.549, -1.825, 0.21, -0.469, 0.549, -0.243, -0.017, -0.695, 0.21, -0.13, -0.243, -0.017, -1.825, 0.662, 0.097, -0.921, 1.227, 0.662, 0.323, 0.775, 1.34, -2.503, -1.825, 1.566, -0.017, -0.582, -0.13, 0.21, -0.582, 1.34, -0.469, -0.243, 0.549, -0.695, 0.662, 0.775, 0.888, 0.775, -1.373, 0.549, 0.323, 2.018, -0.13, -0.469, 0.21, 0.662, -0.017, 0.323, 0.662, 0.323, 0.323, -0.695, -2.503, 0.097, 1.227, -1.712, 0.21, -0.243, 1.453, 0.097, 0.888, 1.227, 1.001, -0.356, -0.695, 1.001, -0.808, 0.097, -0.582, 1.34, 0.21, -0.921, 0.21, 0.775, 1.453, 0.549, 0.436, -1.938, -0.921, -0.243, 1.227, -1.147, -0.582, 0.323, 0.323, 1.114, -0.808, -0.582, 0.21, 0.21, -0.13, -0.921, 1.227, -0.921, -0.017, -1.26, -0.017, -0.017, -0.808, 2.583, 0.775, -1.486, 1.227, -0.582, 1.227, -1.938, -0.695, 1.453, 0.888, 0.436, -0.582, -2.051, -2.277, 1.114, -0.356, -0.356, 2.018, -1.712, -0.243, 0.888, -0.469, -1.147, 1.737, 0.969, 1.249, -0.705, 0.341, -1.403, -0.008, 0.69, -0.426, 0.969, 1.109, 0.551, 0.969, -0.008, 1.109, 0.621, -0.217)
fifaRaw <- c(fifaRaw, 1.667, 0.411, 1.179, -1.333, 0.621, -0.077, 0.132, -0.356, -1.264, 0.551, -0.356, -0.077, 1.039, -1.822, -0.775, 0.062, -1.473, -2.031, 0.83, 0.9, 0.272, -0.636, 0.411, -0.775, -0.147, -1.194, 0.132, 0.9, 1.458, -0.915, 0.132, -2.31, -0.008, 0.551, 0.341, 0.551, -1.543, -0.845, -0.705, 1.877, -0.217, -2.45, 1.179, 0.83, 0.76, 0.83, 0.411, 1.039, -0.636, -0.217, 0.76, 0.76, 0.83, 1.388, 1.109, 0.9, 0.551, 0.9, -1.403, 1.249, -0.147, -0.147, -0.217, -1.264, -0.636, 1.109, -1.822, -1.264, 0.411, -0.008, 1.388, -2.101, 0.411, -0.147, 0.272, 0.969, 0.272, 1.179, -2.45, -0.077, 2.156, 0.132, 1.039, -0.566, 0.621, 0.83, 0.062, 0.062, -2.171, 0.062, 0.83, -0.008, -0.008, -2.729, -0.356, 0.969, -1.054, -0.496, 0.83, -0.496, 0.341, 0.481, 0.341, 0.76, 0.969, -1.752, 0.9, -2.589, -0.496, -1.822, 2.086, -1.194, -1.752, 0.341, 0.76, -0.356, 1.039, -2.31, 0.83, 0.341, -0.636, -0.426, 0.202, 0.481, -2.101, 1.737, 1.179, -0.077, -0.426, 0.9, -1.124, 0.621, 1.249, -0.426, -0.496, -0.636, -0.984, -0.147, 0.76, -0.147, -0.845, -0.426, 0.341, -0.147, -0.356, 0.132, 0.272, -0.705, 0.9, -0.566, -1.473, -1.961, -2.31, -0.915, 1.458, 0.969, -0.217, 1.109, -0.217, 0.83, 1.039, 0.551, 1.249, 1.179, -2.589, 1.179, 0.411, 1.528, 0.202, 0.062, -0.147, 0.411, 0.551, -0.287, -0.077, -2.171, -2.171, 0.202, 0.9, 0.202, 0.76, 0.969, -2.938, -0.496, 0.202, 0.9, 0.341, 0.551, 0.621, 0.062, -2.031, 0.76, 0.969, 0.341, -0.287, -0.008, -0.496, 0.69, 0.341, 0.062, -1.682, -0.287, 0.551, 0.411, 0.062, 1.877, -1.892, 0.202, -1.194, 0.69, 1.318, 1.877, -1.403, -0.008, -1.124, -0.008, 0.341, -0.147, 1.318, -0.147, -2.799, -0.077, -1.543, -1.613, 1.039, 0.132, -0.426, -0.426, 0.481, -1.264, 0.341, 0.9, -0.915, 0.969, 0.132, 0.062, 0.621, -0.636, -0.845, -0.287, 0.551, -2.45, 0.9, 0.551, -0.984, -2.589, 0.411, -0.287, 1.179, 0.132, 1.039, -0.636, 0.551, -0.984, -1.403, -0.356, -0.775, 0.062, -1.892, 0.69, 0.83, 1.039, -2.31, 0.551, -0.845, -0.356, 0.411, 0.551, 1.109, 0.132, -0.287, -0.287, -1.752, 0.551, -0.217, 0.9, 0.132, -0.426, -0.984, -0.147, -0.775, 0.969, -0.566, -2.38, 0.132, -0.008, -1.333, -0.426, -0.147, 1.877, 1.179, 0.272, 0.272, 0.481, -1.124, 0.202, 2.086, -0.566, 1.597, 0.551, -0.636, 0.9, 0.062, -0.217, 1.318, -0.984, 0.202, -2.241, 1.528, 1.318, 0.062, -0.566, -0.496, -0.147, -0.775, 0.76, 1.458, -1.752, -0.356, 0.621, 0.341, 1.318, -1.194, -0.077, -0.287, -0.147, 0.621, -0.984, 0.9, -0.705, -0.426, 0.202, 1.179, 0.202, 0.341, 1.318, 0.411, -1.124, 0.551, 0.69, -1.333, -1.822, 0.341, 1.179, -0.008, -0.077, 0.272, 0.9, 0.132, -0.147, -0.147, 0.481, -0.287, 0.76, -1.264, -0.077, -2.171, 0.969, 1.109, 0.481, 0.272, 1.388, 1.388, 0.621, -0.217, 0.551, 1.528, -0.077, 0.132, -1.752, -0.705, -2.171, 0.411, -1.682, -0.287)
fifaRaw <- c(fifaRaw, 0.272, -1.264, -0.845, -0.775, 1.109, 0.202, -0.287, 0.341, -1.333, 1.597, 0.969, -1.403, 0.272, 0.551, -0.566, 1.737, -0.566, -0.077, 1.039, -0.636, 1.318, 2.226, -0.496, 0.062, -0.636, 0.341, 0.411, 0.481, 0.69, 1.877, 0.481, -0.984, -0.775, 0.272, 1.039, 0.132, 0.551, -0.287, -3.008, -0.356, -0.915, -0.077, 1.318, 0.132, -0.356, -1.054, 0.621, -0.217, 0.062, -1.194, -0.356, -1.054, -0.775, -0.287, 1.109, 1.109, -0.496, -0.077, -0.008, 0.83, 0.411, -0.915, -1.752, -0.077, 0.062, 0.76, 0.341, 0.551, 0.481, 0.621, 0.481, -0.636, -0.287, 0.9, 0.062, 0.83, 0.132, -1.054, -0.705, 1.039, 0.69, -0.356, -0.356, -0.217, -1.613, 0.202, 1.458, -1.054, -0.356, -0.566, 0.481, 0.132, 0.969, 0.202, 1.249, 0.272, -0.426, -0.217, 1.039, 1.109, 0.76, -1.961, -0.077, -0.705, 1.528, 1.249, 0.132, -0.636, 1.18, 0.573, 0.628, -1.854, -0.144, -1.964, -0.695, 0.187, 0.297, -0.199, -0.034, 0.518, 0.408, 1.235, 0.849, 0.077, 1.125, 1.29, 0.959, -1.909, -1.964, 0.573, -1.633, -1.909, 0.242, -1.909, 0.959, 0.573, 1.07, 0.683, -0.806, -1.192, -1.192, -0.971, -1.633, 0.573, -0.034, -1.247, -0.53, 1.014, -0.089, 0.518, -1.192, 0.077, 0.132, 0.408, 1.29, 0.353, -1.743, -0.144, 1.07, -0.364, -0.144, -1.688, 0.739, -1.743, 1.235, -0.144, -1.854, 0.683, 0.959, -0.695, 0.904, -0.144, 0.132, 0.628, -0.144, -0.585, 0.022, 1.621, 0.408, 0.904, 0.022, 1.787, 0.242, -2.019, 0.849, 0.959, -0.971, 1.235, -0.144, -0.916, 0.187, -0.53, -1.688, 1.014, -1.302, -1.743, -0.751, 0.518, -0.089, 0.573, 0.297, 0.242, 0.518, 1.125, -0.806, 0.794, -1.523, 0.849, 0.518, 0.628, 0.132, -1.137, -0.916, -0.916, -1.412, 0.353, 0.022, 1.456, -1.798, -0.585, 0.077, -1.688, -0.806, 1.014, 0.739, -0.585, 1.014, 0.849, 0.518, 1.345, -1.633, 1.4, -1.909, -1.081, -0.695, 0.297, 0.132, -2.129, 0.022, -0.199, -1.026, -0.089, 0.794, 0.628, 0.739, -1.688, -0.585, -1.302, 0.683, 0.242, 1.566, 0.904, 0.132, 0.353, 0.077, -1.633, 0.904, 0.132, 0.187, 0.408, 0.794, 0.022, 0.904, 0.132, 0.959, 1.511, 0.959, 1.125, 0.683, 0.904, -0.144, -0.916, -1.743, 0.518, 0.628, -1.688, -1.798, -1.743, -0.089, 0.849, -1.468, -1.081, 0.242, 0.518, 0.353, 0.463, 0.187, 0.187, -0.199, -1.688, 0.628, -0.364, 0.573, 0.849, 1.29, -1.081, -0.254, 0.187, -1.523, 0.739, -2.074, -1.854, -0.695, 0.408, 0.187, -1.026, -0.034, -2.129, 0.739, -0.254, 0.849, -0.916, 0.739, -0.199, 1.125, 1.345, 1.4, 0.959, 0.849, -0.034, -0.475, 0.077, 1.511, -0.254, 0.849, -0.751, -1.247, -1.578, 0.353, -0.53, 0.297, 1.235, 1.621, -0.53, 0.022, 0.573, 0.518, -1.743, -0.475, 0.463, 0.794, 0.959, -2.019, 0.408, 0.959, -1.743, -0.144, -0.034, -1.854, 0.904, 1.18, 0.739, 1.29, 0.187, -0.144, -0.309, -1.798, 0.518, 0.849, 0.297, -0.695, 0.794, -0.916, 0.463, -0.089, 1.125, -1.743, -0.089, -0.751, -0.585)
fifaRaw <- c(fifaRaw, -1.909, -0.034, 0.959, -1.026, 0.628, -0.53, -0.089, -0.199, 0.683, 0.959, 1.235, -1.909, 0.297, -0.64, -0.475, 0.959, 0.849, -2.35, -0.53, 0.904, -1.743, 0.408, 1.125, 1.125, 0.739, 1.18, 1.456, -1.743, 0.297, 1.125, 1.125, -0.199, -2.129, -1.909, -0.254, 0.628, 1.07, 0.628, -1.688, 1.621, -0.254, -2.185, 0.187, -0.751, 0.959, 1.4, -0.861, 0.408, 0.573, 0.518, -0.64, 0.739, 0.739, -0.53, -0.64, 1.07, 0.959, -0.034, -1.357, 0.904, -0.806, 0.022, 1.07, 1.29, 0.022, 1.456, 0.959, -0.916, 0.849, -1.909, 0.573, 1.125, -1.743, 0.573, -1.247, -1.302, 0.408, -2.019, 0.573, -0.199, 0.794, 0.739, -0.64, 0.518, -1.026, -0.42, 0.408, 0.022, 0.904, -0.254, 0.683, 1.07, -1.633, -0.254, 0.187, -0.64, -1.412, -0.254, 0.794, 0.518, -0.364, -0.53, 1.125, 0.077, 1.18, -0.034, 0.959, 0.849, 0.463, -2.074, 0.077, -1.854, 0.573, -1.688, 0.683, 0.408, 1.235, -0.199, 0.463, -0.806, -0.53, -0.089, 0.628, -0.806, -1.854, 1.787, 0.959, 1.125, -1.357, -0.42, -0.089, -1.743, 0.849, 0.739, 1.621, 0.959, 0.463, -1.357, -1.964, 1.07, 0.408, 0.022, 1.125, 1.235, 0.408, 0.904, -1.798, -1.412, 1.125, -1.302, 0.739, 0.959, -0.695, -0.475, 0.022, 0.739, 0.794, 1.29, 0.904, 0.187, 0.628, 1.014, -2.129, -0.089, 0.683, 0.353, 0.683, 0.297, -1.909, 0.683, -1.854, -0.254, 1.4, 0.683, 1.345, -0.254, -0.089, 0.628, 0.408, -0.034, -0.53, 1.07, -0.806, 0.463, -0.144, 1.07, -1.633, 0.463, 0.794, 1.125, 1.07, -2.24, 0.408, 0.242, 0.187, 0.187, 1.511, -0.585, -0.364, 0.739, 0.573, 1.29, -0.695, -0.585, 0.739, 0.077, -1.854, 0.408, 1.18, -0.144, 0.683, -0.53, -0.53, -1.633, -2.074, 1.345, 0.739, -0.695, 0.573, -1.081, 0.242, 0.408, -0.254, 1.345, 0.959, -0.254, 0.353, -0.144, -0.309, 1.29, 0.463, -1.357, 1.235, -0.089, 0.573, 1.014, -0.089, -0.916, 0.881, -0.454, 0.714, -2.038, -0.454, 0.714, 0.881, -1.204, -1.037, -0.704, -0.787, 0.63, -0.787, 0.38, -1.121, -0.203, 0.13, 0.047, 0.13, -0.203, 0.213, -0.12, -0.037, -0.871, 0.714, -0.037, 0.213, -0.037, 0.047, 0.213, -1.204, -0.62, -0.871, 0.297, -0.037, 1.298, -0.287, 1.881, -1.204, 1.381, 0.63, 0.213, 0.13, -1.538, 1.131, 0.464, 0.881, -1.121, -0.954, 0.297, -2.372, -2.288, -1.288, -0.12, 0.047, 0.547, 1.464, 0.881, -1.288, -0.203, -1.288, 0.797, 0.797, 0.213, -0.454, -0.454, -0.62, 0.213, -1.204, -0.704, 0.38, 1.298, 0.38, -2.955, 0.714, -1.621, -0.537, 0.047, 0.464, 0.547, 0.13, 0.714, 1.298, -1.371, 0.38, 0.213, 0.38, -0.871, -0.287, 1.798, 1.381, 1.047, 1.214, -0.537, 0.881, -2.872, 0.797, 2.131, 1.464, 0.797, -0.037, -1.371, 0.38, 1.131, -0.62, 0.38, -0.287, -0.62, 0.714, 0.797, -1.955, 1.298, 0.714, -0.62, 0.797, 0.047, -0.203, -0.704, 1.214, 0.714, 0.964, -0.12, -2.288, -0.954, -0.287, 0.547, 0.047, -0.12, 1.214, 0.13, -0.287, 0.881, 0.714, 1.131, 0.38, 0.464, -0.287, 0.213)
fifaRaw <- c(fifaRaw, -0.037, 1.965, 0.547, -0.787, -2.288, 0.63, 0.714, 0.797, 1.548, -0.203, 0.797, -0.12, -0.12, 1.464, -0.287, 0.63, 0.213, -0.203, -0.954, -1.454, -0.037, 0.213, -0.037, -0.037, 1.131, 1.548, 0.213, -1.538, -0.287, -0.12, -0.037, -0.954, -1.121, -1.121, -0.704, -0.203, -0.37, 0.547, -1.705, 1.047, -0.203, 1.464, 0.547, -2.538, -1.204, 0.547, 2.048, -0.037, -0.037, 1.381, -0.454, -0.37, 0.797, -1.121, -1.121, -0.704, 0.63, -0.37, -0.287, -0.787, -1.288, -2.455, 0.714, -0.871, -0.12, 0.38, 2.215, -0.871, 1.298, -0.954, 0.13, -0.787, 0.714, 1.214, -0.037, -0.454, -0.203, 0.213, 0.797, -2.705, -0.37, -0.287, 1.214, -0.037, -0.37, 0.38, 0.547, 0.213, -2.038, 0.464, -1.037, -0.537, -0.537, 0.63, 0.297, 1.131, 0.714, -1.371, -1.955, -2.955, 0.63, 0.38, -0.871, 0.797, -0.871, -0.037, 0.881, -0.287, -0.37, 0.047, -0.871, -2.205, -0.537, -0.37, -0.203, -0.037, -2.789, -0.787, 0.38, -1.204, -1.454, 0.797, -0.871, 0.047, -2.038, -0.037, -0.871, -0.203, 2.048, 1.631, -1.204, -0.537, 0.047, 2.215, 0.297, -0.37, -0.203, 0.881, -1.121, -0.704, 1.298, -2.372, 1.965, -0.62, 0.213, -0.287, 1.965, 0.464, 0.38, -0.287, 0.464, -1.121, 0.547, 0.213, 0.13, 0.547, -0.704, -0.454, -0.704, 0.13, 0.547, -2.121, -1.204, -0.62, -0.704, -0.537, -0.12, 1.381, 1.798, -1.454, 1.464, -0.704, 0.964, -0.037, 0.464, -1.037, 0.38, -0.537, 0.38, 0.297, -2.705, -0.037, -0.203, 0.13, -0.037, -0.203, 0.881, -0.287, -0.037, 2.048, 1.047, 0.797, -0.203, -0.704, -1.621, -2.288, 0.047, 0.797, -0.203, 0.797, 0.13, -0.12, 0.297, 1.214, 1.798, -0.787, -2.372, -1.121, 0.547, 1.548, 0.38, 0.547, 0.714, -0.203, 0.13, -0.871, 0.714, -1.037, 0.63, 0.464, 0.797, 0.13, 0.464, 0.63, 0.547, 1.214, -0.537, 0.047, 1.131, 1.047, 0.714, 0.964, -1.621, -0.454, -0.287, -1.371, -0.454, -0.203, 0.464, 0.881, 0.13, 0.13, 0.047, 0.714, 1.464, -1.121, 0.797, -0.12, -0.954, 1.464, 0.213, 1.131, -0.871, 0.881, 1.047, -0.787, -0.871, 0.38, -0.62, 1.047, 0.38, 0.464, -2.372, 0.547, -0.537, -0.12, 0.881, 0.464, -0.37, 0.297, 0.63, -0.787, 0.714, 0.213, -1.037, 0.797, 1.214, 0.464, 0.297, 0.63, -0.12, 0.63, 0.714, 1.965, -0.537, 0.547, -0.954, -1.371, -0.287, 1.047, -0.287, -0.871, -3.289, 0.63, 0.213, -0.62, -1.204, 0.63, 0.547, 1.548, 0.547, 1.548, -0.203, 0.464, 0.881, -1.871, 0.213, 1.131, 0.38, -0.037, -0.454, -1.037, 0.38, -0.704, 0.714, 0.714, -0.62, 0.213, -0.12, -0.37, 0.714, -0.454, -0.454, -0.287, 0.714, 1.298, 0.797, -0.537, 1.298, -2.372, 0.13, 0.047, -0.037, -2.622, 1.381, 1.381, 0.464, 0.547, 0.047, 2.298, 0.714, 1.047, 1.298, 0.38, 1.381, -2.622, -1.204, 0.881, 0.714, -1.288, 0.547, 1.047, -0.704, 2.215, -0.454, -0.454, 0.964, -0.37, -0.537, 0.38, -0.37, 0.13, 1.44, 0.202, -0.151, -2.095, -0.799, -1.153, 0.556, -1.683, 0.792)
fifaRaw <- c(fifaRaw, -0.033, -1.094, -0.917, -0.151, 0.379, 0.556, -0.21, -0.151, 1.145, 0.144, 0.085, -1.27, 1.086, -1.919, -0.151, 0.909, -1.565, 0.438, 1.204, 0.674, 1.086, -0.505, -0.269, 0.556, 0.32, -2.036, -0.033, -0.092, -0.151, -0.092, 0.556, 0.32, 0.556, -2.213, 1.027, 1.44, 0.261, 0.909, 1.322, -1.919, 0.556, 0.202, 0.379, 0.32, -1.27, 0.909, -2.036, 0.85, -0.917, -1.27, -0.21, 0.144, 0.968, 0.615, 0.909, 0.438, -0.092, 0.32, 0.026, -1.683, -0.446, 0.674, -0.563, -0.976, 0.733, 0.144, -2.213, 0.379, 0.733, -0.328, 0.202, -0.387, 0.674, 0.674, -0.505, -1.919, 0.438, -0.328, 0.085, 0.026, 0.85, -0.151, 0.497, 1.734, -0.622, 0.674, -1.153, -0.269, 0.909, 0.968, -0.74, -0.21, 0.556, 0.733, 0.379, 0.438, -0.681, 0.733, 0.085, -0.033, -0.446, -2.861, -0.151, 1.381, -2.272, 0.085, 1.616, 0.202, -1.153, 0.556, 0.144, 0.32, 0.497, -1.977, 1.557, -2.154, 0.085, -0.505, 0.615, 0.438, -1.27, 0.615, 0.379, 0.733, -0.917, 0.379, 1.322, -0.092, -1.801, -0.269, -0.033, 1.204, 0.379, 0.556, 1.086, -0.21, 0.792, 1.499, -1.27, 0.085, 0.202, -0.563, 0.438, 0.615, -0.622, 0.438, 0.144, 1.322, 0.438, 0.674, 0.32, 0.497, 0.438, 0.792, 0.792, -1.035, 0.202, -0.269, -1.86, -1.388, -1.919, -0.033, 0.026, -0.269, -1.506, 0.909, 1.44, 0.674, 0.615, -0.21, 0.909, -0.681, -2.272, -0.622, -0.033, -0.446, -0.151, 1.322, -0.858, -0.622, 0.438, 0.909, 0.792, -2.154, -2.213, -0.269, -0.446, 0.497, 0.261, 0.85, -2.331, -1.624, -1.329, 0.968, 0.438, 1.263, -0.092, 0.497, 0.026, 0.202, 0.615, 0.968, -1.094, 0.615, 0.556, 0.438, 0.438, 1.263, -1.035, -0.092, -0.446, 0.792, 0.085, 0.733, 0.144, 1.557, 0.026, -0.328, -0.269, -0.387, -2.036, 0.792, 0.085, -0.151, 1.263, -1.624, 0.497, 0.261, -2.567, 0.261, -1.742, -1.919, 1.086, 0.674, 0.202, 1.086, 0.085, -0.681, -0.505, -0.21, 0.379, 0.615, 0.615, -0.21, 1.381, 0.144, 0.32, 0.261, -0.446, -2.39, 0.379, -0.563, -1.742, -2.036, -1.801, 0.32, 0.792, 1.381, 1.204, 1.793, 0.32, -1.035, 0.261, 1.145, -2.508, 0.438, 0.144, -0.269, 0.144, 0.497, -2.979, 0.85, 1.675, -1.094, 0.792, 1.086, -1.683, 1.086, 0.909, -0.681, -2.39, 0.026, 0.202, 1.499, 0.32, -2.036, -2.449, -1.683, 1.44, 1.145, -0.976, -2.449, 0.792, 0.379, -2.743, 0.085, -0.033, 0.909, 0.615, 1.145, 0.438, -0.092, 0.202, 0.085, 0.379, 0.615, -0.622, -1.27, 0.497, 1.086, 0.379, 0.026, 0.085, -1.506, -1.094, -0.151, 1.027, -0.622, 0.497, 0.085, -0.387, -0.269, -1.919, -0.092, -0.269, -1.624, 1.145, -0.446, 0.32, 0.085, -2.449, -0.681, 0.85, 0.026, 0.674, 1.675, -0.21, 0.438, 0.32, 1.263, 0.792, 0.909, 0.497, 0.497, 0.615, -1.094, -0.505, 1.027, 0.144, -1.565, -0.505, 0.909, 0.32, -0.21, 0.261, -0.033, 0.556, 0.144, -0.033, 1.499, 0.556, 0.379, -2.095, -0.033, -2.39, 0.32, -0.387, 1.145, 1.145, 0.85, -0.799, 0.379, -0.033, 0.968, -0.269, 0.674, -0.858, -2.036, -0.681, -1.742, 0.792, -0.446, 0.32, -0.21, -2.743, 0.261, 0.144)
fifaRaw <- c(fifaRaw, 1.145, 0.792, 0.144, 0.733, -1.212, 0.909, 0.085, 0.144, 0.792, 0.556, 0.438, 1.086, -1.919, 0.085, 0.556, -0.151, 1.263, 0.026, 0.202, 0.556, -0.446, 1.616, 0.202, 0.85, 0.497, -0.151, 1.322, 0.438, -2.154, 1.322, 0.144, 1.263, -0.033, -0.033, -2.92, 0.792, -1.565, -0.74, 0.674, -0.505, 0.674, 0.32, 1.263, 0.438, 1.381, 1.557, 0.379, 1.44, -0.622, 0.556, 0.497, 0.968, -1.447, 0.615, 0.497, 1.204, 0.379, -1.447, -1.742, 0.026, 0.556, -0.033, 1.322, -0.446, -0.21, 0.085, 1.086, 0.026, -0.799, -0.622, 1.145, 0.144, -1.565, -0.033, 0.909, -0.269, 1.734, 0.379, -1.683, -1.329, -1.624, 1.263, 0.968, -0.21, 0.379, 0.32, 1.027, -0.033, -0.446, 0.261, 0.556, 0.438, 0.085, 0.085, 0.497, 0.968, 0.615, -2.154, 1.204, -0.21, 0.674, 0.85, 0.085, -0.328, -0.138, -0.787, -0.381, -2.164, -1.597, -0.868, 0.753, -2.732, 0.753, -1.111, -1.678, 0.348, -0.057, 0.915, -1.192, 0.429, 0.834, 0.51, -0.3, -0.138, -0.543, 0.753, 0.105, -1.759, -0.057, 0.186, 0.105, 0.591, 0.915, -0.219, 1.158, -0.868, 0.186, 0.591, 1.32, 0.267, -0.543, 0.429, 1.32, 0.105, 0.915, 0.186, -0.868, -0.057, -0.462, -2.326, 1.077, -0.138, -1.759, -0.787, 0.105, -0.706, -0.949, -0.057, 1.077, 0.348, -2.326, -0.138, -1.84, -0.219, 0.186, 0.672, 0.105, 0.105, 0.51, -0.625, -0.625, -2.245, -1.516, 0.267, 0.105, 0.996, -2.488, 0.591, -0.3, -2.002, -0.787, 1.239, 0.51, 2.212, 2.131, 0.915, 0.105, 1.077, -0.138, 0.186, -0.706, -1.678, 1.483, 1.158, 1.077, 1.239, -1.759, -2.651, -0.462, 0.348, 0.672, -2.975, 0.51, -0.138, -0.138, -0.3, 0.51, 0.348, -0.219, 1.564, -0.219, -0.138, 0.834, 0.186, -1.111, 1.32, 0.024, -0.381, 0.51, 0.267, -0.625, -2.164, 0.591, 1.564, 0.024, -0.462, -1.921, 0.024, 0.51, 0.105, 1.726, -1.921, 1.483, 1.239, 0.186, -0.3, 0.51, -1.597, 1.726, 1.402, -0.462, 0.186, -1.435, 0.51, -0.057, 1.077, -1.03, 0.51, -1.759, 0.834, 0.105, -0.3, 0.024, -1.111, -0.381, 0.672, 1.564, 1.158, 0.915, -0.706, 0.753, 1.645, 0.996, 0.834, 0.672, 0.753, 0.834, -0.219, -1.597, -1.273, -0.625, -0.138, 0.267, 0.915, 0.429, -1.354, -1.03, 0.024, -1.759, 0.51, 0.348, 0.591, -1.435, 0.51, -1.678, -0.462, -1.678, 0.348, 0.186, 1.158, 0.672, 0.996, -0.868, -0.625, 1.483, 0.672, -0.381, -0.706, -0.706, 0.186, -0.787, -0.138, -1.759, -1.354, 0.996, -2.651, -0.3, -0.625, 0.672, -0.868, 0.024, 0.591, 1.158, 0.024, -0.462, 1.077, 0.105, 1.077, 0.267, -0.381, 0.186, 1.969, -0.787, 0.51, 0.348, -0.219, -2.245, 0.915, 1.32, 1.564, -0.3, -2.732, -2.894, -1.678, 0.186, 0.834, 0.834, 0.915, -0.3, -1.192, 0.429, -0.219, 0.753, 0.348, -0.057, -0.949, 0.591, 0.024, 0.915, -0.462, 0.672, -0.787, -1.111, 0.429, 0.024, -0.3, 0.348, 0.591, 0.672, 0.105, 1.564, -0.625, -0.057, 0.024, -1.03, 0.186, -0.219, -0.625, 0.51, -0.543, 1.077, 0.105, 0.267, -0.219, 0.834, 1.564, 1.239, -0.949, 0.672, 0.915, -2.326, 0.267, -0.138, 0.591, -0.057, 0.915, -0.057, 0.105, 0.996, -0.462, 0.996, 0.429)
fifaRaw <- c(fifaRaw, 0.915, -1.516, 0.591, 0.672, 0.186, 0.834, -0.3, -1.111, -0.381, 1.158, 0.672, -0.057, 0.834, 0.51, -0.057, -1.192, -0.057, 0.753, -2.245, 0.51, 0.834, 0.267, -0.949, 1.483, -0.219, -0.949, 0.51, -2.245, -1.678, 0.996, 0.024, 0.186, -0.625, 0.348, 0.591, -0.949, 1.888, -1.678, -2.732, 0.996, 0.672, 0.024, 1.807, 0.105, 0.753, -1.111, 0.915, 1.077, -0.381, 0.591, 0.105, -1.759, 1.807, 1.077, 1.645, -0.462, 1.239, -0.625, 0.267, 0.672, -0.219, -0.706, 1.402, -0.138, -1.273, 0.105, 0.267, -2.083, -0.138, 0.996, 0.024, -0.462, 0.348, 0.429, -0.787, 0.591, -1.111, 0.915, 0.915, 0.753, 0.591, 0.186, -0.381, 0.267, -0.462, -0.625, -0.381, -2.083, 0.429, 0.834, -0.381, -1.597, -1.516, 0.591, 0.915, -2.651, 1.402, -0.787, -0.381, 0.996, 2.212, 0.996, -0.787, 0.753, -0.625, -0.381, 0.996, 1.402, -0.381, 0.267, 0.105, 0.105, -2.002, -1.435, -1.435, 1.239, 0.753, 0.834, -0.057, -1.111, -0.138, -0.625, -0.219, 1.564, -0.462, -0.625, 0.915, 0.753, -0.057, 0.834, -0.706, 0.753, -0.381, -0.625, -0.543, 1.077, -0.706, -0.057, -1.111, 1.158, -0.543, 0.51, -1.84, 0.672, -0.057, -0.462, 0.429, 0.51, 0.348, 1.402, -0.219, 0.996, 0.591, 1.077, 0.348, 1.645, 0.429, 0.753, -1.597, 0.591, 0.672, 1.239, 1.158, -0.543, 0.429, -0.381, 0.591, 0.51, 0.024, -0.381, 0.996, -1.597, -1.516, -0.787, 0.429, 0.672, 0.753, -0.787, 1.239, -1.435, -0.057, 0.105, 1.077, -1.597, 0.591, 0.996, -0.3, -0.3, -2.002, 1.402, -0.057, 0.591, 0.672, 0.753, 0.348, -0.706, -0.787, 1.077, -0.138, -0.462, 1.726, -0.138, -1.516, 0.51, -0.219, -0.706, 0.915, -1.354, -0.381, -0.625, -0.3, 1.32, 1.229, 0.426, 0.627, -1.481, -0.377, -1.481, -0.226, 0.326, 0.075, 0.577, -0.477, 0.778, 0.778, 0.527, 1.279, 0.175, 1.229, 1.229, 0.978, -1.13, -1.581, 0.276, -1.33, -1.18, -0.828, -1.531, 0.928, -0.377, 0.727, 0.978, -1.079, -0.979, -0.728, -0.527, -1.481, 0.828, 0.276, -0.728, -1.431, 0.978, -0.477, 0.627, -1.28, 0.276, 0.125, 0.627, 1.129, 0.226, -1.732, 0.226, 1.581, 0.527, 0.125, -1.782, 0.677, -1.481, 1.129, -0.778, -1.933, 0.326, 1.129, -0.577, 1.079, -0.628, 0.426, 0.878, -0.126, -1.33, 0.376, 0.828, 0.577, 0.376, -0.527, 2.283, 0.075, -1.531, 1.029, 1.631, -0.176, 1.079, -0.678, -1.23, 0.125, -1.28, -1.782, 1.029, -1.481, -0.879, -1.029, 0.276, -0.728, 0.727, -0.778, 0.978, 0.928, 0.778, -0.979, 1.029, -0.728, 0.577, 0.426, 1.179, 0.476, -1.28, -0.828, -1.079, -1.079, 0.527, -0.678, 1.48, -1.782, -1.13, -0.226, -1.882, -0.477, 1.179, 0.677, -0.276, 0.828, 0.627, 0.426, 1.43, -1.631, 1.179, -1.732, -1.28, -0.929, 0.878, 0.276, -1.682, 0.276, -0.276, -0.377, -0.076, 0.226, 0.175, 0.727, -1.38, -0.427, -1.431, 0.828, -0.076, 1.882, 1.129, -0.076, -0.226, 0.627, -1.732, 0.778, -0.176, 0.577, 0.577, 0.577, -0.176, 0.878, 0.627, 0.476)
fifaRaw <- c(fifaRaw, 1.279, 1.029, -0.477, 0.627, 0.426, 0.577, -1.18, -1.732, 0.426, 0.727, -1.481, -1.732, -1.631, 0.226, 0.828, -0.678, -1.732, 0.125, 0.627, 0.527, 0.928, 0.075, -0.025, -0.025, -2.033, 0.778, -0.577, 0.727, 0.276, 1.179, -0.728, -0.076, 0.376, -0.527, 0.778, -1.983, -1.631, -1.18, 0.878, 0.476, -0.628, 0.778, -2.133, -1.832, -0.527, 0.878, -1.18, 0.476, -0.226, 1.029, 0.928, 1.38, 1.179, 0.878, -0.377, -0.377, -0.126, 1.631, -0.126, 0.878, -1.481, -0.929, -0.728, 0.677, -0.828, 0.677, 0.677, 1.53, -1.38, 0.276, 0.778, -0.327, -1.933, -1.079, 0.075, 0.476, 0.978, -1.732, 0.627, 1.38, -1.983, -0.276, 0.426, -1.531, 0.376, 1.279, 0.828, 1.43, 0.376, -0.427, -0.025, -0.778, 0.727, 0.878, 0.627, -0.879, 0.577, -1.28, 0.677, -0.678, 1.029, -1.882, 0.125, -0.126, -1.13, -1.983, -0.527, 1.179, -0.778, 1.129, 0.577, -0.327, 0.426, 0.025, 0.075, 1.079, -1.832, 0.376, -1.732, -0.076, 0.527, 1.129, -1.732, 0.025, 1.079, -1.33, 0.727, 0.928, 1.33, -0.577, 1.179, 0.978, -2.033, 0.878, 1.38, 1.129, 0.426, -1.631, -1.933, -0.025, 0.376, 1.33, -0.327, -1.631, 2.082, -0.577, -2.033, -0.226, -1.28, 1.279, 1.781, -1.13, 0.577, 0.778, -0.628, -1.18, 1.681, 0.978, -0.025, -0.076, 0.527, 1.38, 0.276, -0.879, 0.426, -1.732, 0.175, 1.179, 1.229, -0.327, 1.38, 1.079, -1.13, 0.878, -1.581, 0.276, 0.577, -1.732, -0.327, -1.079, -0.828, 0.577, -2.033, -0.377, 0.075, 0.928, 0.527, -0.276, 0.928, -1.33, -0.628, 0.276, 0.125, 0.878, -0.176, 0.527, 1.029, -1.33, -0.025, -0.176, -0.226, -1.481, 0.025, 0.778, -0.176, -0.628, -0.628, 1.279, -0.377, 1.279, 0.025, 1.33, 0.928, 0.978, -1.882, 0.125, -1.732, 0.778, -0.929, 1.129, 0.075, 0.928, -0.276, 0.376, -1.029, 0.376, 0.476, 0.727, -0.527, -1.782, 1.581, -1.23, 1.38, -0.879, -1.18, -0.327, -1.33, -0.427, 0.577, 1.129, 1.079, 1.079, -1.079, -1.431, 1.129, -0.577, 0.075, 1.029, 1.029, 0.778, 0.778, -1.631, -0.628, 1.279, 0.376, 1.179, 0.878, -1.28, -0.628, 0.928, 0.727, 0.828, 1.279, 1.129, 0.928, 1.079, 1.179, -1.481, -0.327, 0.978, 0.727, 1.229, 0.376, -1.882, 0.778, -1.732, 0.276, 1.129, 0.476, 1.43, -0.879, 0.226, 0.025, -0.025, 0.376, -0.276, 1.079, -1.23, 0.677, 0.025, 1.029, -1.682, -0.226, 0.627, 0.326, 0.978, -1.732, -0.076, 0.426, -0.076, 0.276, 1.33, -1.029, 0.175, 0.677, 0.928, 0.928, -0.728, -0.628, 0.527, 0.276, -1.732, 0.426, 0.878, -0.728, 0.978, -0.979, -1.33, -1.732, -1.732, 0.627, 0.778, -1.23, 0.778, -1.029, 0.727, -0.327, -0.527, 1.079, 1.53, 0.577, -0.226, -0.527, -0.678, 0.878, 0.677, -1.933, 1.179, -0.427, 0.727, 1.38, 0.125, -1.13, -0.79, 0.142, -1.373, -2.189, 0.434, -1.315, 0.725, -0.441, 1.016, -0.266, -0.965, -0.557, 0.375, 0.608, 0.026, 0.084, 0.725, 1.424, 0.084, -0.557, -1.723, 1.366, -1.373, -0.266, 0.375, -1.606, 1.075, 1.249)
fifaRaw <- c(fifaRaw, 1.424, 0.492, 0.55, 0.2, 0.375, 0.725, -1.198, 0.375, -0.324, 1.133, 0.725, 1.016, 1.249, 0.492, -1.548, -1.315, 0.725, -1.198, 0.317, 0.084, -1.256, 0.55, 0.841, 0.492, -0.79, -1.14, 1.482, -1.897, 1.249, 0.55, -2.247, 0.026, -0.207, 1.133, -0.441, 0.841, 0.084, -0.207, 0.608, -0.324, -0.732, -0.441, -0.324, 1.075, -1.548, 0.259, 1.133, -1.431, 0.259, -1.14, 0.725, 0.958, 0.608, -0.79, 1.016, 0.084, -0.907, 0.958, -0.207, -0.441, -0.557, -0.033, 0.608, 0.783, 0.958, 0.434, 0.55, 0.841, -0.266, -1.14, 1.016, 0.026, 0.084, 0.2, -0.557, -0.149, 0.958, -0.207, 0.492, -0.615, 0.142, -0.149, -2.13, 0.434, 1.366, -2.189, -0.149, 1.657, 0.2, -1.198, 0.317, 0.026, 0.667, 0.375, -1.839, 1.133, -0.674, -0.382, 0.958, 0.9, 0.725, -1.315, -0.091, 0.608, 0.2, -1.14, -0.615, 1.949, -0.79, -2.072, -0.557, 0.142, 0.841, 0.783, 0.667, 0.317, -0.732, 1.133, 1.016, -1.082, 0.375, -0.557, -1.373, -0.033, 0.55, 0.9, 1.191, -0.091, -0.441, 1.482, 0.259, 0.841, 0.667, -0.266, 1.133, 0.259, -1.082, -0.79, -0.033, -1.606, -0.674, -1.14, 0.55, -0.382, -0.091, -1.023, -0.382, 1.191, -0.441, 0.317, 0.434, 0.492, -0.499, -1.839, -0.965, 0.841, 0.026, 1.133, 0.259, 1.308, -0.091, 0.2, 1.133, 1.016, -1.606, -2.305, -0.557, 0.2, -0.615, 0.667, -1.198, -2.538, -1.315, -0.615, 1.133, 0.958, 0.725, 0.084, -0.674, 0.55, 1.424, 0.55, -1.431, 1.016, 0.667, 0.142, 0.667, -1.489, -0.091, 1.075, -0.441, 0.084, 0.783, 0.2, -1.023, 0.375, 2.24, 0.841, -0.732, -1.315, -0.382, -2.189, 0.841, 1.016, 0.375, 1.191, -2.247, -0.382, 1.075, -1.606, 1.191, -0.79, -1.548, 0.026, -0.091, 0.55, 0.841, -0.557, 0.783, -1.373, -0.441, -1.373, -1.489, 1.249, -0.033, 1.716, 0.783, -1.664, 1.075, 0.317, -1.839, 0.434, -2.072, 0.084, -1.606, -0.324, 0.9, 0.317, -0.033, 0.841, 1.308, 0.841, 0.434, 1.949, 1.249, -1.839, -0.033, 0.2, -1.256, 1.133, 0.55, -1.082, 0.026, 1.308, -1.198, -1.606, -0.965, 0.55, 1.075, 1.133, 1.599, -2.189, 0.958, 1.89, 0.841, -0.149, -1.198, -2.305, -1.315, 1.424, 1.308, -1.373, -2.305, 0.667, -0.615, -1.664, 0.2, 0.9, 0.375, 0.084, 0.9, 0.841, -0.499, 0.9, -0.033, 0.608, 1.133, -1.256, -1.14, -0.091, 0.492, -0.207, 0.55, -1.373, -1.606, -0.674, 1.366, 0.492, 0.259, 1.657, -0.674, -0.324, -0.674, -2.13, -0.79, -1.373, -1.664, 0.958, 0.2, -0.324, 0.55, -1.723, 0.375, 1.599, -0.266, -0.091, 1.191, -0.79, -0.324, 0.667, 0.608, 0.725, 1.075, -0.149, 0.608, -0.674, -1.956, -0.965, -0.324, 0.492, -1.198, -1.315, 0.2, 0.841, 0.084, 0.667, 0.55, 0.317, -0.557, 1.774, 1.133, 0.667, -1.664, -0.907, -0.091, -2.014, 0.259, -0.732, 0.259, 0.725, -0.207, 0.317, -0.79, 0.608, 0.841, -0.907, 0.55, -0.091, -1.373, 1.89, 1.016, -1.14, -0.499)
fifaRaw <- c(fifaRaw, 0.783, 0.375, -0.907, 0.783, 1.308, 1.016, 1.075, 0.841, -0.091, -2.305, 0.259, 0.259, 1.075, 1.249, -0.557, 0.725, 1.075, -1.781, 0.142, 0.841, 0.084, 0.841, -0.324, 1.191, 0.841, -0.499, 0.958, -1.198, 1.133, 0.2, -1.082, 0.608, 0.608, -1.023, 0.375, -0.848, 0.725, -0.965, 0.317, -2.422, 0.084, -1.956, -1.606, 0.725, -1.664, 1.191, 0.725, 1.075, 1.191, 1.075, 0.55, 0.841, 1.075, -0.499, 1.308, -1.14, 1.249, -2.014, 0.667, 0.142, 0.667, 1.016, -1.548, -1.256, 0.259, 1.075, 0.492, 1.308, 0.142, -1.723, -0.674, 0.958, -0.207, 0.2, 0.434, -0.033, 0.142, -2.364, -0.848, 0.9, -0.324, 0.841, 1.716, -0.848, -1.14, -1.664, 1.89, -0.033, 0.725, -0.149, 0.55, 0.2, -0.907, 0.026, 0.142, 0.317, -0.033, 0.375, -0.674, -0.149, 0.492, -1.082, -1.198, 0.608, -0.324, 1.482, -0.965, 0.434, 1.075, -1.686, 0.133, -0.537, -1.207, 0.277, -1.351, 0.516, -1.159, 0.085, -0.25, -1.255, 0.324, -0.106, 0.564, -0.968, -0.585, 0.994, 0.324, 0.851, 0.324, -1.351, 0.899, -1.063, 0.037, 0.659, -1.542, -0.681, 0.803, 1.042, 1.138, 0.468, 0.277, 0.516, 0.277, -1.255, -1.063, -0.872, 1.138, 0.851, 1.186, 1.234, 1.329, -1.542, 0.899, 0.899, -0.728, -1.016, 0.803, -1.59, -0.25, 1.377, -0.537, -1.255, -1.542, 0.611, -1.111, 1.09, 0.468, -1.829, -1.255, 0.564, 1.425, -1.494, 0.994, 0.324, 0.851, -0.202, 0.803, -0.872, 0.277, -0.681, -0.728, -1.063, 1.664, 0.946, -1.063, 0.037, 0.516, 1.138, -0.92, 0.946, 0.516, 0.994, 0.277, -1.207, -0.681, 0.181, 0.372, 0.229, 1.234, 1.473, 0.42, 0.994, 0.564, 0.085, -0.968, 0.564, -1.255, 0.899, -1.063, -1.255, 0.564, 0.707, -0.011, 0.994, -0.058, 0.659, -0.25, 1.09, 0.899, -1.159, 0.994, 1.616, -1.638, 0.707, 0.994, 0.372, -1.063, -0.154, -1.159, 0.611, 0.755, -1.733, 0.946, -1.111, 0.085, 1.234, 0.803, 1.09, -1.446, 0.803, 0.468, 0.42, -0.776, -0.489, 0.707, -1.063, -1.686, 0.468, 0.803, 1.042, 0.946, 1.042, 1.138, -1.159, 0.899, 0.659, -1.207, 0.324, -0.633, -1.542, -0.489, 0.851, 1.138, 1.281, 0.181, -1.446, 1.186, 0.803, 1.186, 0.803, -1.016, 0.468, 0.946, -1.59, -0.728, -0.968, -1.207, -1.542, -1.111, 0.611, 0.133, 0.611, -1.159, 0.803, 0.372, -1.063, 0.707, -0.154, 0.803, -0.92, -1.829, -0.776, 0.994, 0.229, -1.398, 1.042, 0.42, -0.441, 0.372, 0.946, 0.564, -1.925, -1.494, 0.181, 0.899, -0.633, 0.659, 1.138, -1.829, -1.638, -0.824, 0.277, 0.851, -0.776, 0.516, -1.542, 0.899, -0.585, 1.09, -1.446, 1.281, 1.09, 0.803, -0.441, 0.085, 0.851, 0.707, 0.324, 0.372, 0.899, 0.085, 0.229, 0.899, 0.946, 0.899, -1.063, -1.063, -1.398, -1.207, 0.707, 0.994, -1.398, 1.186, -1.111, -0.776, -0.537, -1.781, 1.186, -0.824, -1.733, 0.468, -0.633, 0.516, 1.042, -0.872, 0.803, -1.59, 0.372, -1.063, -1.686, 1.09, 0.564, 0.803, 0.946, -1.063, 1.138, 0.133, -0.92, 0.372, -0.537, 0.516, -1.686, -0.346, 0.851, 0.611, 0.755, 0.946, 1.616, 0.659, -1.207, 1.808, -1.111, -1.303, 0.181, 0.803)
fifaRaw <- c(fifaRaw, -1.351, -1.398, -0.776, -1.063, 0.468, 1.377, -1.159, -0.633, -1.351, 0.181, 1.473, 0.899, -0.824, -1.638, 1.042, 1.616, 1.281, -1.111, -1.351, -1.925, -1.016, 1.138, 1.377, -1.255, -1.446, 0.516, 0.277, -1.59, 0.564, 0.755, -0.25, -0.489, 0.803, 0.946, -1.303, 0.611, 0.851, 0.899, 1.09, -1.255, -1.446, 1.042, 0.803, -0.824, 0.707, -1.398, -1.686, -1.494, 1.281, 0.372, -0.25, 1.616, -0.537, 0.324, -0.25, -1.351, -1.111, -1.351, -1.59, 0.564, 0.707, 0.707, 0.755, -1.59, 0.755, 1.281, 0.277, -0.489, 0.803, -1.303, -0.106, 0.899, 0.899, 0.516, 1.234, 0.324, 0.659, 0.229, -1.303, 0.037, -0.393, 1.09, -1.638, -1.59, 0.994, 1.138, 0.468, 0.946, 0.851, 0.372, -0.92, 1.76, 0.946, -1.733, 0.277, -1.59, -1.207, -1.59, -0.106, 0.229, 0.229, 1.09, -0.537, -1.159, -1.303, 0.707, 0.851, -1.063, 0.659, 0.468, -1.686, 0.516, 1.521, -0.585, -0.106, 0.946, 0.707, -0.92, 1.281, 1.09, 1.09, 1.138, 1.329, -0.011, -1.303, -0.728, 0.037, 0.707, 0.707, 0.707, 0.229, 1.09, -1.207, 0.707, 1.09, 0.372, 1.042, -1.063, 0.803, 0.994, -1.063, 0.707, -1.59, 1.569, 0.899, 0.564, 0.324, -0.872, -1.494, 0.755, -0.441, 0.611, -0.968, -0.393, -1.877, 0.659, -1.829, -1.542, 0.946, -1.303, 1.234, 0.899, 1.281, 1.234, 1.234, 0.564, 0.564, 1.281, 0.516, 0.611, -1.303, 0.851, -1.686, 0.659, 0.516, 0.564, -0.298, -0.968, -1.255, -1.686, 0.037, 0.707, -0.537, 0.133, -1.255, -0.011, 0.899, -1.398, 0.755, -0.154, -0.441, 0.181, -1.111, -1.686, 1.281, -1.542, 0.851, 0.468, -1.59, -1.207, -1.207, 2.047, -0.011, 0.611, -0.298, 0.659, 1.281, -1.303, -0.154, -0.872, -1.207, 0.946, 0.803, 0.516, -0.154, -0.489, -1.303, -1.207, 1.856, 0.229, 1.138, -0.25, 0.611, 0.707, 1.227, 0.436, 0.881, -1.738, -0.848, -1.688, -1.293, -0.453, 0.535, 0.584, -0.404, 0.881, 0.683, 0.239, 1.177, 0.14, 1.029, 1.573, 0.98, 0.14, -1.441, 0.337, -1.787, -0.008, 0.09, -1.886, 0.634, 0.733, 0.288, 0.683, -1.342, 0.337, 0.041, -1.243, -1.54, 1.326, 0.337, -0.008, -0.7, 0.584, -1.046, 0.436, -2.034, 0.535, 0.782, 1.128, 1.078, 0.584, -1.836, -0.107, 0.98, 0.535, 0.288, -1.639, 0.98, -1.688, 0.93, -0.404, -2.182, 1.276, 0.387, 0.337, 0.98, 0.634, 0.337, 0.387, 0.14, -1.293, -0.305, 0.683, 0.486, 0.634, 0.14, 1.474, 0.782, -1.738, 0.387, 0.535, 0.189, 1.326, -0.552, -1.194, 0.239, 0.584, -1.787, 0.634, -0.996, -0.404, -0.947, -0.255, -0.947, 0.782, 0.288, 0.337, 0.831, 1.523, -0.996, 1.276, -0.107, 0.387, 0.881, 0.831, 0.683, -1.243, 0.387, -1.243, 0.14, 0.337, -0.552, 0.733, -2.281, -1.243, 0.288, -2.182, -1.342, 0.831, 0.634, -0.206, 0.337, 0.782, -0.601, 0.93, -1.688, 0.733, -1.688, -1.293, -0.255, 0.98, 0.535, -1.738, 0.535, -1.046, -0.206, 0.189, 1.029, 0.239)
fifaRaw <- c(fifaRaw, 0.535, -2.133, -0.996, -0.058, 0.634, 0.041, 1.177, 0.683, 0.436, 0.436, 1.177, -1.836, 1.029, 0.337, 0.535, 0.93, -0.255, -1.145, 0.535, 1.029, 0.881, 1.128, 0.535, -1.54, 0.436, 0.93, -0.305, 0.14, -1.738, 0.831, 0.337, -1.738, -1.886, -1.738, 0.486, 1.078, 0.041, -1.639, -0.157, 0.683, 0.782, 0.683, 0.486, 0.189, -0.157, -2.034, 0.239, -0.354, 0.733, 0.831, 0.288, -0.7, 0.189, -0.157, -0.601, 1.177, -2.034, -1.688, -0.898, 0.634, 1.128, 0.041, 1.029, -2.133, -1.935, -0.008, 0.683, 0.584, 0.98, -0.404, 0.782, 0.98, 1.326, 0.634, 0.239, -0.947, 0.683, -0.157, 1.474, 0.387, 0.733, -1.293, -0.255, -0.157, 0.782, -0.799, 0.288, -0.7, 0.387, -1.095, 0.98, 0.486, 0.189, -2.232, 0.387, 0.535, 0.634, 0.436, -1.935, 0.634, 1.029, -2.232, -0.651, 0.387, -2.034, 0.041, 1.128, 0.535, 0.387, -0.206, -1.046, -0.206, -0.354, 0.436, 0.93, -0.157, -1.046, 0.14, -1.293, 0.337, -0.947, 0.881, -2.182, 0.634, -0.058, -1.392, -2.083, 0.14, 0.09, 0.387, 0.93, 0.881, 1.029, 0.436, 0.535, -1.243, 1.523, -1.935, 0.239, -0.601, 0.09, 0.535, 1.375, -1.886, -0.799, 1.177, -1.985, 1.326, 1.523, 1.029, 0.733, 0.782, 1.622, -2.133, 0.486, 0.98, 1.227, 0.288, -1.589, -2.133, 0.09, 0.782, 0.239, 0.535, -1.441, 1.82, 0.337, -2.232, 0.337, -1.194, 1.029, 1.474, -1.046, -0.749, 0.93, 0.239, -0.996, 1.523, 0.634, 0.387, 0.189, 0.683, 0.584, 0.239, -0.7, 0.683, -1.886, 0.782, -0.848, 1.573, 0.09, -0.058, 1.078, -1.342, 0.634, -1.836, 0.881, 0.288, -1.886, 0.337, 0.881, -1.441, 0.189, -2.182, -0.552, -0.848, 0.782, 0.881, 0.041, 0.486, -1.342, 0.14, 0.634, 0.584, 1.128, -0.058, 0.337, 1.029, -1.589, -0.354, 0.337, -0.601, -1.639, 0.189, 1.078, 0.486, -0.7, -0.305, 0.634, 0.041, 1.227, -0.404, 0.486, 0.831, 0.831, -1.441, 0.288, -1.886, 0.535, -0.058, 0.486, 1.177, 1.029, 0.189, 0.733, -1.342, -0.255, 0.189, 0.14, -1.145, -1.738, 0.93, -1.441, 1.078, -0.305, 0.041, -0.255, -1.935, -0.206, -0.157, 0.337, 0.436, 0.782, -0.404, -1.688, 1.424, 0.288, -0.058, 0.881, 0.733, 0.337, 1.128, -1.935, 0.436, 0.733, 0.733, 0.881, 1.177, -0.947, -0.404, 0.239, 0.683, 1.029, 0.93, 0.337, 0.584, 0.535, 1.029, -1.738, 0.634, 1.128, 0.782, 1.128, 0.09, -2.232, 0.387, -2.083, -0.255, 0.881, 0.98, 1.029, -1.293, 0.535, -0.404, 0.189, 0.288, -0.206, 0.881, -0.996, -0.157, 0.387, 1.177, -1.935, 0.239, 0.93, 0.436, 1.177, -1.589, -1.836, 0.337, -0.453, 0.337, 1.128, -1.194, -0.008, 0.782, 0.337, 1.227, -1.342, -0.749, 0.93, 0.387, -1.738, 0.436, 1.227, 0.584, 0.634, -0.947, -1.886, -1.787, -1.589, 0.535, 0.782, -0.898, 1.276, 0.09, 0.189, 0.733, -0.354, 1.276, 1.128, 0.782, -0.848, -1.54, -0.354, 1.177, 0.584, -1.738, 0.337, -0.354, 0.288, 1.029, 0.041, -1.095, 0.517, 0.517, 0.731, -1.907, 0.374, -0.838, -1.123, 0.16, 0.089, 1.016, 0.517)
fifaRaw <- c(fifaRaw, 0.232, 0.374, 0.588, 1.586, -1.693, 1.658, 1.301, 1.016, -0.41, -1.693, 0.588, 0.374, -1.693, -0.125, -0.41, 0.588, 0.731, 0.873, 1.515, -1.337, -0.766, -1.265, -1.479, 0.089, 0.659, -0.053, -0.909, -1.194, 0.873, -1.693, 0.802, -0.553, 1.016, 0.374, 0.731, -0.053, 1.158, -3.119, 0.945, 1.8, 0.446, 0.873, -1.622, 0.588, -0.624, 0.659, -1.337, -0.838, 0.873, 0.731, -0.125, 0.374, -0.053, 0.089, 0.945, 0.16, -1.551, 0.802, 0.232, 0.089, -0.339, 0.517, 2.299, 0.517, -1.052, 0.374, 1.016, 0.588, 0.16, -0.695, -1.622, -1.052, -1.194, 0.446, 0.659, -1.836, -0.481, -1.693, 0.374, 0.089, 0.089, -0.766, 1.23, 0.945, 1.301, -1.693, 1.301, -0.41, -0.909, 0.303, 1.087, 0.16, -1.265, -1.052, -0.41, -0.41, 0.018, -0.98, 1.444, -1.622, -1.693, 1.515, -1.479, -1.337, 0.446, 0.659, -0.909, 0.588, -0.196, 0.303, 1.016, -1.479, 1.158, -1.408, -1.693, -0.553, 1.586, 0.517, -1.907, 0.089, -1.622, -1.337, 0.089, 0.945, 0.16, 0.16, -2.905, -1.123, -1.265, 0.802, 0.588, 1.8, 0.945, -0.481, 0.018, 1.515, -0.98, 0.945, 0.588, -0.053, 1.444, -0.125, -0.909, 1.016, 0.517, 0.374, 1.158, 0.945, -2.263, 0.659, 0.303, -0.053, -0.053, 0.802, 0.802, 0.303, -0.267, -2.62, -0.267, 0.802, 0.945, -1.123, -1.836, 0.018, 0.446, 0.303, 1.087, -0.196, 0.588, 0.232, -1.551, 0.446, -0.695, 0.303, 0.232, 0.802, -1.622, 0.303, -0.481, -1.408, 1.158, -1.693, -0.766, -1.836, 1.087, -0.053, -1.052, 1.372, -0.125, 0.588, 0.16, 0.659, 0.303, 1.016, -0.053, 0.517, 1.301, 0.873, 1.515, 0.089, -0.339, -0.339, -0.481, 1.943, 0.873, 0.446, -1.194, -0.553, -0.624, 0.873, -1.052, 0.731, -1.123, 0.16, -1.408, 1.515, 0.517, 0.446, -1.337, -0.553, 0.232, 0.303, 1.444, 0.873, 0.802, 1.301, -1.907, -0.838, 0.232, -1.052, 0.588, 1.372, 0.802, 1.016, 0.446, -0.98, -0.267, -1.123, 0.588, 0.303, 1.087, -0.267, 0.374, -1.764, 0.16, -0.909, 1.158, -1.408, 0.945, -0.196, -1.551, -0.624, -0.41, 0.446, -0.339, 1.158, 1.016, 1.372, 0.374, -0.909, -1.907, 0.802, 0.089, 0.588, -2.121, -0.267, 0.802, 1.729, 0.16, -0.624, 1.158, -1.622, 1.016, 0.731, 0.945, 0.089, 1.087, 1.23, -0.838, 1.016, 1.301, 1.301, -0.053, 0.232, -1.764, -0.481, 1.087, 0.873, -0.624, -2.05, 1.943, -0.41, -1.978, 0.018, -1.337, 1.586, 2.37, -0.481, -0.053, -1.194, -1.551, -1.693, 1.586, 0.659, 0.089, -0.481, 0.089, 0.802, -0.053, -1.622, 1.016, -0.909, 0.018, -0.624, 1.586, -0.766, -0.553, 0.588, -1.194, 0.16, -1.265, 1.016, -0.196, -0.41, 0.303, 0.303, -1.265, 0.873, -1.479, 0.303, -1.337, 0.802, 1.515, -0.053, 1.372, -1.836, 0.303, 0.588, 0.731, 1.158, 0.089, 0.731, 1.444, -0.909, -0.196, 1.016, -0.624, -1.978, -0.41, 1.23, 0.018, -0.125, -1.408, 1.087, -0.695, 1.301, 0.16, 1.016, -0.553, 0.945, -2.05, -0.053, -0.125, 0.873, -0.553, 0.731, 0.731, 0.659, -0.481, 0.659, -1.265)
fifaRaw <- c(fifaRaw, 0.089, 0.446, -0.838, -0.339, -1.337, 1.586, -2.05, 1.087, -1.194, 0.303, -0.624, -1.408, -1.123, -0.339, -0.909, -0.339, 1.301, -0.909, -1.337, 0.731, 0.018, -0.267, 0.446, 1.016, 0.802, 1.087, -1.194, -1.123, 0.659, 0.945, 0.802, 0.018, -1.479, 0.089, -0.766, 0.446, 0.303, 1.444, 0.802, -0.196, 0.873, 0.517, -0.339, 0.374, 1.016, 0.374, 1.729, 0.588, -1.551, 0.945, -0.766, -0.267, 0.873, -0.553, 1.372, -1.337, 0.873, -0.41, -1.337, -0.267, 0.16, 1.016, -1.337, -0.196, -0.053, 1.158, -1.408, -0.196, -0.553, 0.303, 0.802, -0.766, -0.267, -0.553, -0.41, -0.125, 0.374, -0.624, -0.41, 0.731, 0.232, 0.374, -1.052, -0.267, 0.374, 0.802, 0.731, -0.053, 0.945, -0.695, 0.446, -1.194, -0.339, -1.123, -1.693, 0.659, 1.301, -1.265, 0.659, 0.16, 1.23, 0.446, 0.089, 1.016, 1.586, 0.446, -0.196, -1.907, -0.98, 0.16, 0.303, -1.265, 0.303, -0.053, 0.945, 1.871, -0.766, -1.836, 1.026, 0.712, 0.712, -1.606, 0.086, -1.042, -0.854, 0.211, -0.165, 0.023, -0.039, 0.838, -0.29, 0.587, 0.336, 0.462, 2.154, 0.838, 0.838, -1.042, -1.543, -0.603, -1.167, -0.729, -0.729, -1.543, 1.088, -1.105, 1.339, 0.712, -0.353, -0.165, -0.165, -0.353, -1.731, 0.336, -0.541, -1.23, -0.917, 0.587, -0.478, 0.274, -0.791, -0.165, -0.039, 0.399, 0.712, -0.165, -0.979, 0.399, 1.527, 0.086, -0.165, -1.857, 0.65, -1.105, 0.023, -0.729, -0.729, 1.339, -0.039, -0.29, 1.715, 0.9, 0.211, 0.023, -0.666, -0.541, 0.274, 1.214, 1.026, 0.65, 0.524, 1.59, 0.462, -2.233, 0.274, 0.336, -0.102, 1.088, -0.165, -0.541, -0.227, -0.478, -1.481, 1.402, -0.854, -0.666, -0.854, 1.402, -0.039, -0.227, -0.039, 0.462, 1.214, 1.652, -0.478, 1.59, -0.165, 0.65, 0.9, -0.29, -0.039, -1.418, -1.293, -0.791, -0.541, 0.524, -0.729, 1.778, -2.358, -0.666, 0.462, -2.358, -0.791, -0.165, 0.712, 0.838, 1.026, 0.838, 0.712, 1.026, -1.355, 0.775, -2.045, -0.979, 0.023, 0.963, -0.165, -1.606, -0.729, 0.211, -0.039, 0.336, 0.963, -0.165, 0.399, -2.233, -0.666, -0.603, -0.227, -0.039, 0.336, 0.838, 0.65, -0.478, 0.9, -1.481, 0.838, 0.838, 0.775, 1.339, -0.227, -1.167, 0.023, 0.775, 0.399, 1.652, 0.211, -1.105, -0.165, 0.399, -0.415, -0.603, -1.606, 0.838, 1.402, -1.418, -1.543, -2.107, 0.65, 0.65, -0.478, -1.543, -0.791, 0.462, 0.086, -0.353, 0.086, 0.274, -0.227, -2.107, -0.039, -0.165, -0.353, 1.276, 1.276, -0.227, -0.478, 0.524, -0.979, 1.088, -2.233, -1.919, -0.478, 0.65, 1.402, -0.29, -0.478, -2.17, -1.418, 0.086, 0.587, -0.353, 1.339, 0.149, 0.963, 1.339, 1.088, 1.214, 1.214, -0.415, -0.165, 0.211, 1.966, 0.023, -1.042, -0.791, -0.541, -0.854, 0.838, -0.791, 0.524, 1.715, 1.84, -0.729, 0.838, 1.151, 0.149, -2.045, -0.415, -0.603, 1.151, 0.775, -1.794, 0.399, 1.464, -2.17, 0.086, 0.462, -1.355, -0.227, 1.402, -0.29, -0.039, 0.775, 0.274, 0.9, -0.729, 0.399, 0.211, -0.102, -1.105, 0.399, -0.039, 0.274, -0.478, 1.527, -2.233, -0.353, -0.102)
fifaRaw <- c(fifaRaw, -0.102, -1.982, 0.149, -0.102, -0.791, 0.65, 0.462, -0.039, 0.149, 0.462, -0.603, 1.088, -1.982, -0.102, -0.979, 0.524, 0.524, 1.276, -1.355, -0.791, 1.151, -1.481, 1.088, 1.778, 1.402, -0.791, 1.464, 2.216, -1.857, 0.587, 0.9, 1.84, 0.712, -2.233, -1.857, 1.214, 0.023, 0.023, 0.775, -1.669, 1.464, 0.149, -1.794, 0.023, -0.102, 1.966, 1.652, -0.729, -0.415, 0.399, -0.353, 0.086, 1.339, 0.023, -0.102, -0.102, -1.105, 1.026, 0.274, -0.666, 0.838, -1.105, 0.023, -0.227, 1.464, -0.039, -0.541, 1.402, -0.478, 0.9, -1.481, 0.211, 1.214, -1.857, -0.227, -0.478, -0.854, 0.587, -2.233, 0.65, 0.023, 0.775, 0.524, -0.917, 1.276, -0.854, -0.165, -1.293, -0.541, 0.712, -0.603, -0.165, 1.151, -1.418, -0.603, -0.165, -0.165, -1.606, 0.712, 1.464, 1.339, -0.227, -0.478, 1.464, -0.165, 0.462, 0.211, 1.214, 1.088, 1.339, -1.669, 0.9, -2.358, 0.023, -0.854, 0.211, 0.211, 1.276, -0.666, 1.214, -0.854, -0.415, -0.102, 0.524, -0.227, -1.543, 2.028, -0.917, 0.838, -0.917, -0.478, -0.478, -0.165, 0.023, 0.336, -0.603, -0.29, 0.211, -1.042, -1.543, 0.462, 0.211, -0.666, 0.023, 1.214, 0.65, 1.715, -1.731, -0.791, -0.415, 0.838, 0.399, 0.587, -0.478, 0.149, 0.9, 0.587, 1.026, 2.216, 0.336, 0.65, 1.088, 0.775, -1.982, -0.165, 1.088, 0.023, 1.026, 0.211, -0.415, -0.791, -1.418, 1.026, 1.652, 1.214, 0.211, -0.666, 0.712, -0.227, 0.838, -0.039, -0.29, -0.227, -0.165, -0.415, -0.541, 0.963, -1.418, -0.478, 0.462, -0.165, 0.963, -2.233, -1.481, 0.775, -1.23, 0.211, 1.402, -0.666, 0.65, 1.088, -0.165, 1.214, -0.165, -0.29, 1.276, -0.353, -1.669, 1.088, 0.336, -0.039, 0.524, -0.729, -1.042, -1.982, -2.045, 1.715, 1.652, -0.541, 0.963, -1.919, 1.402, -0.039, 0.149, 2.028, 1.088, -0.478, -1.105, -0.603, -0.478, 1.026, 0.462, -2.421, 1.276, -0.541, 0.086, 1.402, -0.478, -0.979, 0.801, 0.31, -0.673, -1.573, -0.918, 0.064, -0.263, 0.392, 0.31, 0.392, 0.146, 0.555, -0.263, 0.31, 1.292, -0.673, 1.702, 2.029, 0.555, -0.591, -1.655, 0.637, 0.064, -1.901, 0.064, -0.345, 0.637, 0.473, 1.128, 1.702, -0.837, -1.328, -0.345, -0.918, -0.837, 1.128, -0.018, 0.637, 0.228, 1.128, 0.883, 0.064, 0.555, 0.146, 0.637, 0.637, 0.473, 0.883, -0.182, -0.345, 0.392, -0.837, -0.018, -1.41, 0.473, -1.819, 0.473, -0.427, -1.655, 0.31, 0.473, 0.555, 0.31, 0.473, 0.064, -0.1, -0.018, -1.082, 0.146, 0.146, -0.182, 0.392, -0.018, 2.193, 0.146, -0.509, 0.473, -0.918, 0.965, 1.047, -0.263, 0.473, -0.182, -0.182, 0.064, 0.228, -1.328, -1.655, -1.41, 1.128, 1.21, 0.637, 0.555, 0.31, 0.146, 1.783, -1.164, 1.538, -0.345, 0.31, 0.146, -0.509, 0.228, -0.837, -0.427, -0.591, -0.1, 0.146, -0.345, 1.292, -2.638, 0.637, 1.538, -2.965, -0.918, 1.128, 0.555, -1.655, -0.1, 0.228, 0.719, 1.047, -1, 0.473, -1.082, -1.328, 0.883, 0.719, 1.538, -1.082, 0.392)
fifaRaw <- c(fifaRaw, -0.918, -0.427, -0.755, 1.865, 0.31, -0.263, -3.539, -1.819, 0.228, 0.801, 0.473, 1.947, 0.801, -1.328, -0.345, 0.883, -0.182, 0.883, -0.1, -1, 1.456, -0.182, -0.427, 1.62, 0.064, 0.801, 1.374, 0.555, 0.473, 0.228, 0.637, -0.263, 0.637, 0.31, 1.047, -0.755, -0.345, -3.702, -0.182, 0.555, 1.21, -1.246, -0.263, -0.837, 0.31, 0.31, -0.018, -0.591, 0.392, -0.182, -2.228, -0.755, 0.555, 0.228, -0.345, 0.555, -0.182, -1.737, -1.328, -0.018, 1.128, -1.737, -2.147, -1.246, 0.228, 0.392, -0.509, -0.1, -0.182, -0.591, -0.509, 0.31, 0.228, 1.374, -1.164, 0.146, 1.538, 0.965, 1.374, -0.673, 1.128, 0.31, 0.555, 1.374, 0.392, 0.555, -0.018, -1.41, -0.755, 0.637, -1.164, 0.064, -1.082, 1.538, -1.082, 0.31, 0.555, -0.018, -2.147, 0.31, 0.883, 0.31, 1.292, -0.1, 0.31, 1.292, -2.31, 0.555, -0.182, -0.263, -0.1, 1.047, -0.018, 0.228, -0.263, 1.047, -0.918, -1.655, -0.1, 0.473, 0.883, -0.755, 0.392, -0.1, -1.41, 0.883, 0.965, -2.147, 0.146, -1.328, -0.673, -2.147, -1.41, -0.263, 0.146, 0.392, 0.637, 1.62, -0.182, -0.427, 1.21, 1.292, -1.901, 0.31, -0.427, -1, 0.965, 1.538, -0.1, -0.182, 1.783, -0.673, 1.128, 0.637, 0.801, 1.702, 0.146, 1.702, -2.802, 0.31, 2.111, 1.783, -0.755, 0.31, -1.573, -1.246, 0.965, 1.128, -0.591, -1.655, 1.292, -0.755, -2.802, -0.427, -0.755, 0.637, 1.947, 0.228, -0.345, 1.21, 0.228, -0.018, 1.62, 0.719, -1.082, -1.246, 0.228, 0.392, -0.673, -0.509, 0.31, -1, 0.637, 1.292, 0.883, -1.41, 0.883, 0.719, -1, 0.228, -1.983, 0.473, -0.263, -2.638, -0.427, -0.673, -0.263, 0.637, -2.556, 0.473, -0.1, 0.883, 0.473, -0.182, 1.128, -0.673, 0.228, 0.31, -0.591, 1.62, -1.328, 0.392, 0.064, -0.837, -1.655, -0.1, 0.064, -0.182, -1.082, 0.883, 0.637, -0.755, 0.801, 0.31, -1.082, 1.047, 1.374, 1.047, 0.719, 0.064, -0.755, -0.755, -0.918, 0.555, -1.41, -0.755, 0.392, 0.801, -0.755, -0.509, -1.246, 1.128, -0.427, -1.082, -1.492, -2.802, 0.31, 1.21, 1.21, -1.41, 0.555, -0.1, -1.082, 0.146, 0.719, -0.263, 0.473, 0.473, -1.246, -2.392, 1.292, -0.1, -0.673, 0.801, 0.801, 0.392, 0.719, -2.883, 0.555, 0.31, -0.182, 0.883, 0.228, 0.555, 0.719, -0.837, 0.473, -0.755, 1.783, 0.637, -0.182, 0.473, 0.965, -1, -0.018, 0.719, 0.801, 1.374, -0.263, -1.655, 0.064, -0.018, -1.246, 1.047, 0.392, 1.456, 0.146, 0.555, 1.21, 0.555, -0.509, -0.345, 1.456, -1.082, 0.228, -0.427, 1.047, -0.673, -0.1, 0.555, 0.555, 1.047, 0.31, 0.555, -0.918, 0.392, -0.673, 0.883, -1.164, -0.591, 0.146, 0.392, -0.427, -0.591, -1.082, -0.1, 0.146, 0.801, -0.263, 0.801, -1.082, 0.392, -0.1, -0.345, -0.918, -2.72, 1.947, 0.473, -0.018, 0.883, 0.146, -0.018, -0.673, -0.018, 1.62, 1.456, -1.082, 0.228, -0.345, -0.755, 0.555, 0.555, -1.655, 1.128, -0.837, 0.965, 1.783, -0.182, -1.164, 0.232, -0.472, -0.522, -1.428, 0.232, -1.377, 0.836)
fifaRaw <- c(fifaRaw, -0.774, 0.283, 0.836, -0.22, -0.371, 0.484, 0.685, -0.874, -0.824, 0.937, 0.081, 0.937, 0.383, -1.88, 0.031, -1.981, -0.422, 0.534, -1.73, -0.623, 0.635, 0.635, 1.138, 0.383, 0.383, -0.12, 0.685, -1.88, -0.522, -1.428, 1.289, 0.987, 1.238, 1.289, 0.635, -1.528, 0.635, 1.037, -0.925, -1.025, 0.635, -1.78, -0.874, 1.138, -0.422, -0.371, -1.78, 0.534, -1.629, 1.037, 0.434, -1.78, -0.874, 0.283, 1.49, -0.723, 1.138, 0.434, -0.623, -0.573, 0.786, -0.371, 0.484, -1.227, -0.723, 0.031, 1.238, 1.037, -1.88, -0.17, 0.735, 1.037, 0.333, 1.238, 0.735, 0.735, -0.673, -1.478, -0.774, 0.031, 0.434, 0.484, 1.238, 1.289, 0.283, 0.635, 0.132, -0.573, -0.522, 0.434, 0.735, 1.188, -1.478, -1.176, 0.635, 0.584, -0.17, 0.584, 0.333, 0.081, 0.182, 1.087, 0.735, -1.629, 1.188, 1.037, -1.981, 0.735, 0.735, -0.12, -0.673, -1.579, -0.673, 0.685, 0.283, -0.975, 1.037, -1.277, 0.182, 1.138, 0.937, 1.44, -1.83, 0.232, 0.584, 0.735, -0.472, -0.623, 0.182, -1.377, -1.981, -0.371, 1.037, 1.037, 0.735, -0.019, 0.735, -0.774, 0.836, 0.534, -1.73, -0.422, -0.422, -0.925, -1.377, 0.987, 1.138, 1.44, 0.534, -1.176, 1.339, 0.735, 1.238, 0.685, -0.925, 0.132, 1.138, -1.528, 0.383, 0.031, -1.377, -1.78, -1.126, 0.836, -0.975, 0.333, -1.931, 0.685, 0.685, -0.673, 0.283, -0.774, 0.735, -1.327, -1.377, -0.623, 1.087, 0.232, -1.327, 0.685, 0.735, -0.774, 0.534, 1.138, -0.371, -2.082, -1.629, 0.383, 0.383, -0.12, 0.786, -0.22, -1.327, -1.327, -0.371, 0.534, 0.635, -0.925, 0.484, -1.73, 0.383, -0.271, 1.238, -1.377, 1.087, 1.087, 0.685, -1.126, -0.371, 0.685, 0.987, 0.584, 0.333, 0.987, 0.232, -0.12, 0.886, 1.339, 0.886, 0.182, -0.925, -0.975, -2.132, 0.836, 1.087, -0.573, 1.641, -1.73, 0.081, 0.434, -1.679, 1.087, -0.874, -1.83, 0.031, -0.774, 0.987, 1.037, 0.383, 0.786, -1.126, 0.182, -0.522, -1.377, 1.138, 0.836, 0.937, 1.138, -0.321, 1.289, 0.232, -1.931, 0.786, -0.17, 0.031, -1.78, -0.975, 0.937, 0.635, 0.484, 0.534, 1.641, 0.635, -1.126, 2.043, -0.522, -1.629, 0.232, 0.584, -1.428, -0.673, 0.283, -1.277, 0.685, 1.842, -1.528, -1.227, -1.277, 0.534, 1.289, 0.886, 0.584, -1.981, 0.735, 1.389, 0.987, -0.12, -1.227, -1.83, -0.472, 1.238, 1.037, -0.623, -1.428, 0.383, 0.534, -2.082, 0.383, 0.836, 0.534, -0.774, 0.685, 0.886, -0.925, 0.937, 0.836, 0.434, 1.037, -0.573, -0.925, 0.836, 0.635, -1.428, 0.584, -1.277, -1.78, -0.12, 1.339, -0.774, -0.371, 1.389, -1.126, 0.584, -0.975, -1.679, 0.685, -0.925, -1.377, 0.333, 0.584, 0.383, 0.031, -1.78, 0.937, 1.238, -0.12, -0.673, 1.087, -1.428, -0.07, 0.886, 1.289, -0.07, 1.238, 0.283, 0.836, 0.031, -1.88, -0.22, -0.07, 0.987, -1.88, -1.025, 0.534, 1.238, 0.333, 0.886, 0.836, 1.087, -0.019, 1.641, 0.937, -1.629, -0.07, -1.78, 0.383, -1.88, -0.12, 0.132, 0.333, 1.238, -0.824, -0.371)
fifaRaw <- c(fifaRaw, -0.422, 0.735, 0.937, -0.522, 0.886, 0.383, -1.629, 0.685, 1.44, -1.327, 0.383, 1.087, 0.735, -1.83, 1.49, 0.786, 0.937, 1.389, 0.886, 0.333, -1.428, 0.534, -0.371, 0.534, 0.635, 0.031, 0.484, 1.238, -1.377, 0.735, 1.087, -0.12, 0.786, -0.874, 0.987, 1.238, -0.975, 0.836, -0.12, 1.641, 0.635, 0.635, 0.383, -0.623, -1.579, 0.635, 0.132, 1.037, -0.774, -0.07, -1.931, 0.383, -1.83, -1.176, 0.886, -1.327, 1.087, 0.635, 1.339, 1.339, 0.886, 0.836, 0.534, 1.238, 0.484, 0.685, -1.327, 0.584, -1.629, 0.987, -0.422, 0.786, -0.321, -1.277, -1.83, -1.78, 0.383, 0.232, -0.472, -0.522, -1.277, -0.321, 1.188, -0.422, 0.635, 0.635, -0.371, 0.081, -1.478, -1.478, 1.289, -1.025, 0.836, 0.081, -1.83, -1.78, -1.428, 1.993, -0.874, 0.484, -0.019, 0.735, 0.635, -1.428, -0.17, -1.227, -1.126, 0.534, 0.735, 0.534, 0.081, -0.573, -0.22, -1.679, 1.138, 0.534, 1.188, 0.132, 0.434, 0.937, -1.551, -0.023, -1.506, -1.281, 0.382, -1.641, 1.011, -0.697, 0.292, 0.337, -0.292, -0.068, -0.337, 0.786, -0.517, -1.641, 0.921, 0.292, 0.427, 0.337, -1.371, 0.696, -1.461, 0.247, 0.651, -1.551, -0.292, 0.966, 0.831, 1.011, 0.606, 0.696, 0.786, 0.831, -1.551, -0.922, -1.012, 1.281, 0.966, 1.146, 1.236, 0.741, -1.416, 0.292, 0.921, -1.057, -0.832, 0.786, -1.326, -0.248, 1.325, -0.742, -0.292, -1.461, 0.606, -1.596, 1.056, 0.786, -1.686, -1.191, 0.516, 1.325, -1.551, 1.056, 0.292, 0.786, 0.067, 1.281, 0.067, 0.831, -0.967, -0.292, -0.787, 1.415, 0.921, -1.641, -0.337, 0.337, 0.921, -0.967, 0.921, 0.831, 0.831, 0.247, -1.371, -0.697, 0.472, 0.561, 0.516, 1.191, 1.236, 0.651, 0.561, 0.921, 0.067, -0.967, 0.831, -1.236, 0.876, -0.877, -1.461, 0.651, 0.786, 0.472, 0.831, 0.741, 0.561, -0.113, 0.966, 0.696, -1.731, 1.146, 1.325, -1.551, 0.876, 1.056, 0.247, -0.742, -1.101, -1.506, 0.741, 0.786, -1.506, 0.741, -1.641, 0.472, 1.146, 0.651, 1.191, -1.461, 0.696, 0.247, 0.741, -1.326, -0.877, 0.786, -0.922, -1.371, 0.382, 0.921, 1.101, 1.101, 0.561, 0.966, -0.877, 1.236, 0.921, -1.461, -0.472, -0.337, -1.506, -0.517, 1.011, 1.011, 1.236, -1.236, -1.101, 1.146, 0.696, 1.281, 0.696, -1.146, 0.831, 0.876, -1.596, -0.742, -1.012, -1.416, -1.596, -1.506, 0.696, -0.517, 0.876, -1.686, 0.741, 0.292, -1.236, 0.292, -0.607, 0.696, -0.877, -1.641, -0.697, 1.101, -0.472, -1.191, 0.876, 0.696, -0.517, 0.292, 1.011, 0.202, -1.596, -1.461, 0.741, 0.921, -1.101, 0.741, 1.011, -1.461, -1.461, -0.517, 0.516, 0.831, -0.697, 0.427, -1.596, 1.011, -0.832, 0.786, -1.146, 1.101, 1.146, 0.606, -1.057, 0.112, 0.876, 0.876, 0.472, 0.516, 0.966, 0.382, 0.292, 0.786, 1.325, 0.921, 0.022, -1.326, -1.281, -1.551, 0.561, 1.056, -1.461, 1.37, -1.236, -0.248, -0.158, -1.506, 1.146, -0.158, -1.506, 0.382, 0.112, 0.651, 1.011, -0.517, 0.921, -1.641, 0.696)
fifaRaw <- c(fifaRaw, -0.742, -1.506, 1.101, 1.011, 0.921, 0.966, -0.203, 1.101, 0.247, -1.596, 0.427, -1.146, 0.696, -1.551, 0.022, 0.696, 0.786, 0.561, 0.651, 1.46, 0.696, -1.461, 1.73, -1.146, -1.506, 0.516, 0.786, -1.461, -1.326, -0.382, -1.641, 0.561, 0.876, -1.641, -0.967, -0.472, 0.157, 1.46, 0.966, -0.832, -1.641, 1.056, 1.191, 1.191, -1.012, -1.551, -1.506, -0.922, 1.281, 1.325, -0.967, -1.461, 0.651, 0.472, -1.641, 0.651, 0.921, -0.697, -0.517, 1.011, 0.831, -1.326, 0.921, 0.696, 0.651, 1.011, -0.877, -1.686, 0.966, 0.786, -1.057, 0.606, -1.191, -1.641, 0.516, 1.281, -0.158, -0.292, 1.46, -1.146, 0.741, -1.416, -1.416, -0.967, -0.832, -1.551, 0.831, 0.606, 0.786, 0.561, -1.641, 0.831, 1.056, 0.292, -0.877, 1.191, -1.191, 0.472, 0.786, 1.101, 0.561, 1.056, 0.382, 0.561, -0.472, -1.506, -0.158, 0.337, 0.786, -1.641, -1.371, 0.831, 1.146, 0.561, 1.146, 0.472, 0.382, -1.281, 1.64, 0.741, -1.236, -0.292, -1.416, -0.967, -1.596, -0.068, 0.247, 0.247, 1.011, -0.697, -0.742, -1.146, 0.651, 0.786, -0.248, 0.606, 0.472, -1.551, 0.516, 1.37, -0.787, 0.202, 1.281, 0.741, -1.596, 1.37, 1.146, 0.606, 1.415, 1.236, 0.022, -1.416, -0.697, 0.022, 0.651, 0.831, 0.337, 0.247, 1.325, -1.506, 0.651, 0.876, 0.651, 0.651, -1.551, 1.056, 1.191, -1.012, 0.651, -1.326, 1.37, 0.831, 0.651, -0.607, -0.652, -1.461, 0.472, -0.113, 0.741, -0.248, -0.158, -1.506, 0.651, -1.686, -1.236, 0.876, -1.101, 1.191, 0.741, 1.146, 1.011, 0.786, 0.831, 0.651, 1.46, 0.606, 0.606, -1.281, 0.921, -1.506, 0.831, -1.506, 0.966, -0.742, -1.146, -1.461, -1.326, 0.292, 0.651, -0.562, -0.292, -1.461, -0.652, 0.651, -1.101, 0.876, -0.248, -0.113, 0.112, -1.596, -1.012, 1.236, -0.922, 0.831, 0.516, -1.012, -1.461, -1.506, 2, -0.248, 0.606, -0.697, 0.651, 0.561, -1.236, -0.203, -1.146, -1.057, 0.921, 1.056, 0.921, 0.337, -0.562, -1.371, -1.686, 1.37, -0.248, 1.011, -0.113, 0.696, 0.516, -1.516, 0.053, -1.331, -1.377, 0.837, -1.562, 0.976, 0.099, 0.376, 0.56, 0.145, -0.685, -0.731, 0.699, -0.593, -1.608, 0.929, -0.316, 0.237, 0.237, -1.47, 0.791, -1.377, 0.053, 0.837, -1.423, 0.791, 1.022, 0.653, 0.791, 0.468, 0.745, 0.883, 0.699, -1.285, -0.962, -1.1, 1.206, 0.883, 0.976, 1.252, 0.422, -1.562, 0.883, 1.114, -1.1, -1.1, 0.653, -1.562, -0.778, 1.483, -0.778, 0.007, -1.562, 0.33, -1.377, 1.252, 0.745, -1.654, -1.562, 0.653, 1.621, -1.331, 1.16, -0.039, 0.33, -0.178, 0.976, -0.501, 0.745, -0.962, -0.501, -1.054, 1.068, 0.791, -1.562, -0.039, 0.468, 1.022, -1.562, 0.883, 0.837, 1.022, 0.468, -1.47, -0.731, 0.237, 0.33, 0.468, 1.206, 1.252, 0.653, 0.837, 0.929, 0.191, -1.193, 0.883, -1.147, 0.929, -0.916, -1.285, 0.653, 0.791, 0.468, 0.976, 0.468, 0.56, -0.178, 0.929, 0.284, -1.516, 1.022, 1.298, -1.47, 0.699, 1.252, 0.007, -1.147, -1.285)
fifaRaw <- c(fifaRaw, -1.423, 0.745, 0.653, -1.516, 0.791, -1.562, 0.284, 1.206, 0.422, 1.16, -1.239, 0.606, 0.284, 0.929, -1.516, -1.008, 0.699, -0.778, -1.377, 0.376, 0.929, 1.114, 0.791, 0.376, 1.022, -1.008, 1.252, 0.976, -1.608, -0.455, -0.316, -1.147, -1.147, 1.022, 0.976, 0.837, -1.054, -0.962, 0.883, 0.653, 1.298, 0.745, -1.193, 0.883, 0.929, -1.562, -0.731, -0.87, -1.47, -1.47, -1.562, 0.653, -0.547, 0.791, -1.654, 0.791, 0.284, -1.285, 0.284, -0.824, 0.745, -0.316, -1.47, -0.593, 1.114, -0.501, -1.47, 1.068, 0.745, -0.224, 0.284, 1.068, 0.237, -1.47, -1.331, 0.791, 0.837, -1.1, 0.883, 0.33, -1.516, -1.423, -0.27, 0.237, 0.791, -0.87, 0.468, -1.516, 0.883, -0.824, 0.791, -1.331, 1.068, 1.391, 0.699, -0.916, 0.099, 1.022, 0.929, 0.653, 0.56, 0.929, 0.56, -0.132, 0.745, 1.345, 0.976, -0.362, -1.377, -1.193, -1.47, 0.699, 1.16, -1.193, 1.298, -1.516, -0.039, -0.178, -1.562, 1.252, -0.27, -1.47, 0.237, 0.33, 0.422, 0.837, 0.145, 1.114, -1.516, 0.653, -0.731, -1.562, 1.068, 1.068, 0.976, 0.837, -0.962, 1.114, -0.316, -1.562, 0.33, -0.962, 0.514, -1.47, -0.132, 0.468, 0.791, 0.376, 0.422, 1.391, 0.745, -1.331, 1.898, -1.193, -1.654, 0.422, 0.745, -1.377, -1.423, -0.639, -1.608, 0.653, 0.699, -1.562, -1.054, -0.87, 0.007, 1.483, 0.883, -1.1, -1.47, 0.653, 0.929, 1.206, -0.962, -1.331, -1.654, -1.331, 1.252, 1.206, -0.87, -1.239, -0.178, 0.376, -1.654, 0.745, 0.883, -0.916, -0.316, 0.929, 0.929, -1.608, 0.837, 0.745, 0.929, 0.976, -0.778, -1.147, 1.16, 0.653, -1.1, 0.653, -1.377, -1.285, 0.653, 1.16, -0.501, 0.053, 1.714, -0.87, 0.56, -1.239, -1.423, -1.054, -0.639, -1.608, 0.791, 0.976, 0.883, 0.468, -1.516, 0.883, 1.252, 0.237, -0.408, 0.699, -1.377, 0.237, 0.929, 1.114, 0.376, 1.252, 0.237, 0.56, -0.824, -1.423, -0.178, 0.376, 0.929, -1.608, -1.285, 0.883, 1.16, 0.468, 1.114, 0.468, 0.653, -0.87, 1.714, 0.606, -1.377, -0.316, -1.193, -0.87, -1.516, -0.178, 0.376, 0.237, 1.206, -0.685, -0.778, -1.147, 0.883, 0.791, 0.053, 0.699, 0.514, -1.193, 0.376, 1.298, -0.962, 0.284, 1.114, 0.791, -1.654, 1.391, 1.114, 1.391, 1.76, 0.606, 0.33, -1.47, -0.824, 0.56, 0.791, 0.929, 0.33, 0.237, 1.345, -1.423, 0.606, 1.022, 0.606, 0.883, -1.285, 0.883, 1.252, -1.193, 0.883, -1.193, 1.483, 0.883, 0.699, 0.284, -0.639, -1.377, 0.33, -0.455, 0.929, -0.547, 0.099, -1.654, 0.699, -1.562, -1.377, 0.883, -1.193, 1.114, 0.837, 1.298, 1.022, 0.929, 0.745, 0.653, 0.883, 0.56, 0.653, -0.962, 0.837, -1.562, 0.837, -1.285, 0.976, -1.147, -1.423, -1.239, -1.331, 0.791, 0.883, -0.547, 0.007, -1.423, -0.962, 0.699, -1.377, 0.837, 0.422, -0.962, 0.099, -1.47, -1.008, 1.298, -1.054, 0.791, 0.699, -0.962, -1.608, -1.193, 2.083, -0.132, 0.56, 0.053, 0.699, 0.606, -1.008, -0.039, -1.239, -1.1, 0.791, 0.699, 1.022, 0.191, -0.593, -1.516, -1.423)
fifaRaw <- c(fifaRaw, 1.391, 0.053, 1.022, 0.284, 0.56, 0.745, -0.156, -0.523, -0.418, 2.156, -0.261, 3.364, -0.628, -0.576, -0.156, -0.523, -0.628, -0.471, -0.366, -0.261, -0.313, -0.576, -0.156, -0.576, -0.471, -0.208, 1.998, -0.366, 2.366, -0.418, -0.366, 2.313, -0.208, -0.628, -0.471, -0.313, -0.471, -0.523, -0.261, -0.156, 3.206, -0.471, -0.366, -0.366, -0.208, -0.471, -0.313, -0.261, 2.523, -0.313, -0.471, -0.313, -0.576, -0.523, 2.628, -0.681, -0.156, -0.523, -0.366, 2.996, -0.418, 2.838, -0.261, -0.523, 1.893, -0.471, -0.366, -0.156, -0.418, -0.313, -0.208, -0.681, -0.628, -0.313, -0.366, -0.156, -0.628, -0.418, -0.208, -0.418, -0.156, 2.733, -0.628, -0.156, -0.208, -0.156, -0.418, -0.576, -0.576, -0.313, 3.101, -0.103, -0.103, -0.261, -0.261, -0.366, -0.576, -0.261, -0.261, -0.366, -0.156, -0.261, -0.418, -0.418, -0.261, -0.261, -0.418, -0.628, -0.208, -0.208, -0.208, -0.156, -0.576, -0.261, -0.366, -0.576, 2.051, -0.156, -0.576, 1.946, -0.261, -0.418, -0.523, -0.261, -0.523, -0.156, -0.208, -0.471, 2.628, -0.471, 2.523, -0.523, -0.103, -0.576, -0.156, 2.313, -0.523, -0.681, -0.523, -0.471, -0.261, -0.103, -0.208, 2.103, -0.366, -0.208, -0.628, -0.471, -0.313, -0.103, -0.261, -0.261, -0.628, 3.101, -0.471, -0.366, -0.313, -0.576, -0.156, -0.313, -0.576, -0.156, -0.261, -0.103, -0.523, -0.208, -0.576, -0.628, -0.523, -0.366, 2.681, -0.471, -0.576, 2.733, 2.628, 2.523, -0.576, -0.366, -0.261, 2.418, -0.418, -0.418, -0.628, -0.523, -0.418, -0.156, -0.418, 1.42, -0.628, -0.366, -0.156, -0.208, -0.628, -0.103, -0.261, -0.156, -0.576, -0.208, 1.525, 2.366, -0.628, -0.418, -0.576, -0.208, -0.261, 2.576, 2.733, -0.628, -0.628, -0.208, -0.471, -0.681, -0.576, -0.366, -0.103, -0.523, -0.208, -0.628, -0.576, -0.261, -0.208, -0.523, -0.261, -0.313, -0.366, -0.313, -0.471, -0.208, -0.576, -0.576, -0.471, -0.418, -0.576, -0.313, -0.261, 2.261, -0.156, -0.628, -0.418, -0.313, 2.733, -0.576, -0.576, 2.313, -0.471, -0.576, 2.576, -0.418, -0.208, -0.576, -0.103, -0.208, -0.261, -0.208, -0.366, -0.156, -0.523, -0.208, -0.261, -0.208, -0.576, -0.261, -0.628, -0.208, 1.735, -0.471, -0.208, -0.628, 2.418, -0.523, -0.471, -0.418, -0.418, -0.103, -0.523, -0.418, -0.523, -0.471, -0.156, 3.154, -0.208, -0.576, -0.261, -0.418, -0.208, 2.103, -0.418, -0.471, 2.523, -0.576, -0.261, -0.523, -0.523, -0.523, -0.103, 2.313, -0.313, -0.523, -0.681, -0.523, 2.208, 2.366, -0.628, -0.576, -0.471, -0.366, 1.893, -0.261, -0.628, 2.523, -0.208, -0.156, -0.208, -0.366, -0.523, -0.628, -0.313, -0.208, -0.523, -0.208, -0.471, -0.576, -0.313, -0.261, -0.576, -0.313, -0.523, -0.156, 3.049, -0.261, -0.261, -0.576, -0.418, -0.313, -0.523, -0.471, -0.418, 2.628, -0.628, -0.471, 2.681, -0.523, -0.628, -0.418, -0.208, 2.156, -0.366, -0.261, -0.261, -0.471, -0.471, -0.313, -0.523, -0.471, -0.208, -0.208, -0.576, -0.156, -0.418, -0.523)
fifaRaw <- c(fifaRaw, 2.261, -0.366, -0.261, -0.103, 2.576, -0.103, -0.628, -0.418, -0.208, -0.208, -0.418, -0.471, -0.523, -0.261, -0.471, -0.418, -0.681, 2.786, -0.523, 2.471, -0.366, -0.628, -0.208, -0.471, -0.366, -0.261, -0.156, -0.208, -0.366, -0.681, -0.208, -0.471, 2.838, -0.523, -0.523, -0.208, -0.156, -0.366, -0.103, 2.156, -0.208, -0.208, -0.366, -0.156, -0.628, -0.523, 1.998, -0.576, -0.681, -0.418, -0.366, -0.418, -0.103, -0.103, 2.944, -0.576, -0.471, -0.366, -0.208, -0.366, -0.628, -0.681, -0.208, -0.628, -0.523, -0.208, -0.471, -0.628, -0.366, -0.418, 1.998, -0.523, -0.628, -0.681, -0.681, -0.261, 2.051, -0.208, 2.628, -0.418, -0.681, -0.471, -0.156, -0.208, -0.103, -0.418, -0.576, -0.261, -0.576, -0.576, -0.366, -0.523, -0.366, -0.156, 2.523, -0.471, -0.313, -0.313, -0.313, 3.101, 2.733, -0.208, -0.156, -0.523, -0.156, -0.471, -0.156, -0.366, -0.471, -0.576, -0.576, -0.628, -0.208, -0.576, 2.838, -0.681, -0.313, -0.366, -0.103, -0.208, 2.261, 2.681, 2.733, -0.366, -0.628, -0.261, -0.313, -0.313, -0.313, -0.471, -0.103, -0.313, 0.002, -0.313, -0.471, -0.156, -0.418, -0.156, -0.313, 2.681, -0.156, -0.523, -0.576, -0.418, -0.313, -0.523, -0.485, -0.323, -0.216, 1.987, -0.592, 3.115, -0.216, -0.538, -0.108, -0.699, -0.377, -0.538, -0.108, -0.485, -0.162, -0.216, -0.485, -0.323, -0.592, -0.162, 2.095, -0.538, 2.363, -0.431, -0.377, 2.256, -0.216, -0.377, -0.377, -0.27, -0.27, -0.646, -0.162, -0.162, 3.008, -0.485, -0.431, -0.162, -0.538, -0.485, -0.699, -0.323, 2.632, -0.699, -0.323, -0.592, -0.538, -0.216, 2.686, -0.699, -0.323, -0.592, -0.538, 2.739, -0.27, 2.793, -0.323, -0.699, 2.095, -0.162, -0.485, -0.377, -0.323, -0.377, -0.592, -0.323, -0.485, -0.162, -0.485, -0.592, -0.646, -0.377, -0.323, -0.377, -0.27, 2.578, -0.431, -0.162, -0.108, -0.485, -0.108, -0.592, -0.485, -0.538, 3.008, -0.162, -0.216, -0.431, -0.646, -0.108, -0.485, -0.431, -0.27, -0.431, -0.377, -0.216, -0.646, -0.162, -0.485, -0.431, -0.538, -0.431, -0.646, -0.485, -0.538, -0.377, -0.162, -0.162, -0.216, -0.162, 1.665, -0.216, -0.27, 1.826, -0.485, -0.216, -0.538, -0.538, -0.323, -0.162, -0.592, -0.108, 2.9, -0.216, 2.739, -0.323, -0.162, -0.216, -0.27, 2.471, -0.485, -0.162, -0.592, -0.592, -0.323, -0.485, -0.216, 1.88, -0.323, -0.216, -0.377, -0.323, -0.431, -0.162, -0.431, -0.216, -0.216, 2.954, -0.538, -0.216, -0.216, -0.377, -0.377, -0.27, -0.377, -0.27, -0.377, -0.216, -0.699, -0.646, -0.216, -0.485, -0.162, -0.431, 2.686, -0.431, -0.485, 2.847, 2.632, 2.847, -0.162, -0.323, -0.699, 2.202, -0.162, -0.592, -0.323, -0.377, -0.162, -0.216, -0.431, 2.202, -0.592, -0.27, -0.377, -0.216, -0.431, -0.27, -0.377, -0.162, -0.646, -0.27, 1.933, 2.578, -0.485, -0.323, -0.27, -0.216, -0.216, 2.471)
fifaRaw <- c(fifaRaw, 2.847, -0.646, -0.377, -0.377, -0.538, -0.377, -0.592, -0.216, -0.431, -0.27, -0.646, -0.538, -0.27, -0.108, -0.538, -0.27, -0.323, -0.538, -0.323, -0.485, -0.431, -0.592, -0.538, -0.485, -0.108, -0.216, -0.485, -0.592, -0.27, 2.256, -0.377, -0.323, -0.377, -0.323, 2.9, -0.538, -0.592, 2.202, -0.27, -0.538, 2.632, -0.27, -0.538, -0.27, -0.485, -0.377, -0.431, -0.27, -0.538, -0.216, -0.592, -0.162, -0.27, -0.538, -0.108, -0.216, -0.162, -0.162, 1.718, -0.162, -0.27, -0.162, 2.417, -0.431, -0.485, -0.485, -0.216, -0.323, -0.27, -0.592, -0.323, -0.377, -0.377, 2.632, -0.538, -0.646, -0.485, -0.485, -0.485, 2.686, -0.162, -0.108, 2.686, -0.646, -0.323, -0.377, -0.485, -0.592, -0.323, 2.309, -0.323, -0.323, -0.592, -0.431, 2.202, 2.202, -0.485, -0.377, -0.377, -0.216, 2.363, -0.377, -0.485, 2.148, -0.485, -0.323, -0.592, -0.162, -0.27, -0.485, -0.377, -0.646, -0.538, -0.377, -0.646, -0.216, -0.646, -0.592, -0.538, -0.323, -0.108, -0.431, 2.847, -0.592, -0.162, -0.485, -0.27, -0.108, -0.377, -0.431, -0.377, 3.115, -0.592, -0.323, 2.686, -0.646, -0.162, -0.538, -0.323, 1.826, -0.216, -0.592, -0.323, -0.431, -0.485, -0.162, -0.431, -0.108, -0.485, -0.216, -0.27, -0.538, -0.162, -0.377, 2.417, -0.592, -0.431, -0.27, 2.524, -0.538, -0.216, -0.27, -0.699, -0.646, -0.323, -0.431, -0.323, -0.377, -0.162, -0.216, -0.323, 2.524, -0.431, 2.578, -0.592, -0.431, -0.216, -0.108, -0.162, -0.485, -0.592, -0.216, -0.431, -0.377, -0.592, -0.377, 2.847, -0.592, -0.592, -0.538, -0.646, -0.323, -0.431, 2.471, -0.27, -0.377, -0.699, -0.323, -0.538, -0.592, 1.826, -0.699, -0.323, -0.592, -0.538, -0.485, -0.377, -0.323, 2.632, -0.592, -0.485, -0.646, -0.377, -0.485, -0.377, -0.538, -0.162, -0.646, -0.538, -0.27, -0.538, -0.592, -0.108, -0.216, 2.041, -0.162, -0.27, -0.377, -0.538, -0.108, 2.095, -0.592, 2.739, -0.646, -0.162, -0.323, -0.431, -0.27, -0.108, -0.108, -0.323, -0.377, -0.431, -0.377, -0.538, -0.538, -0.646, -0.592, 2.739, -0.108, -0.216, -0.108, -0.323, 3.115, 2.739, -0.377, -0.323, -0.216, -0.431, -0.431, -0.162, -0.646, -0.162, -0.646, -0.431, -0.431, -0.699, -0.431, 2.686, -0.162, -0.162, -0.646, -0.162, -0.108, 2.524, 2.9, 1.826, -0.538, -0.216, -0.485, -0.592, -0.162, -0.485, -0.377, -0.162, -0.592, -0.108, -0.431, -0.216, -0.699, -0.485, -0.27, -0.485, 2.471, -0.431, -0.485, -0.323, -0.538, -0.377, -0.162, -0.525, -0.25, -0.25, 1.999, -0.36, 2.328, -0.525, -0.47, -0.47, -0.25, -0.47, -0.305, -0.36, -0.579, -0.195, -0.579, -0.14, -0.14, -0.579, -0.305, 1.999, -0.525, 2.493, -0.305, -0.634, 2.219, -0.525, -0.525, -0.086, -0.525, -0.305, -0.47, -0.415, -0.525, 2.822, -0.47, -0.415, -0.14, -0.086, -0.36, -0.36, -0.415, 2.658, -0.634, -0.634, -0.525, -0.305, -0.086, 2.658, -0.579, -0.195, -0.086, -0.47)
fifaRaw <- c(fifaRaw, 3.371, -0.579, 3.042, -0.086, -0.36, 2.054, -0.36, -0.525, -0.25, -0.25, -0.634, -0.579, -0.634, -0.305, -0.14, -0.47, -0.415, -0.525, -0.415, -0.634, -0.25, -0.525, 3.316, -0.36, -0.25, -0.25, -0.525, -0.195, -0.47, -0.579, -0.579, 2.767, -0.47, -0.25, -0.525, -0.47, -0.525, -0.579, -0.634, -0.525, -0.086, -0.14, -0.579, -0.47, -0.14, -0.36, -0.579, -0.579, -0.525, -0.579, -0.195, -0.305, -0.305, -0.195, -0.415, -0.525, -0.47, 2.548, -0.689, -0.36, 1.999, -0.525, -0.579, -0.634, -0.47, -0.47, -0.47, -0.579, -0.14, 2.383, -0.579, 2.658, -0.305, -0.25, -0.25, -0.14, 2.822, -0.305, -0.14, -0.47, -0.579, -0.36, -0.25, -0.195, 1.89, -0.525, -0.47, -0.47, -0.25, -0.47, -0.305, -0.305, -0.195, -0.415, 3.206, -0.086, -0.634, -0.25, -0.525, -0.086, -0.579, -0.634, -0.47, -0.415, -0.25, -0.634, -0.14, -0.36, -0.195, -0.525, -0.305, 3.206, -0.305, -0.305, 2.822, 2.328, 2.767, -0.086, -0.25, -0.195, 2.274, -0.525, -0.195, -0.525, -0.579, -0.415, -0.25, -0.195, 1.56, -0.634, -0.415, -0.36, -0.525, -0.14, -0.415, -0.47, -0.195, -0.579, -0.305, 2.219, 2.548, -0.579, -0.195, -0.305, -0.415, -0.47, 2.822, 3.097, -0.195, -0.195, -0.305, -0.086, -0.525, -0.579, -0.47, -0.634, -0.36, -0.689, -0.525, -0.634, -0.36, -0.525, -0.525, -0.47, -0.305, -0.525, -0.305, -0.415, -0.415, -0.195, -0.086, -0.525, -0.305, -0.525, -0.305, -0.525, 1.89, -0.415, -0.47, -0.195, -0.36, 2.658, -0.415, -0.579, 2.219, -0.47, -0.195, 2.274, -0.36, -0.195, -0.086, -0.36, -0.195, -0.195, -0.14, -0.25, -0.579, -0.195, -0.525, -0.36, -0.47, -0.634, -0.47, -0.415, -0.36, 1.944, -0.086, -0.634, -0.25, 2.328, -0.525, -0.305, -0.47, -0.25, -0.525, -0.47, -0.305, -0.36, -0.579, -0.579, 2.767, -0.305, -0.195, -0.525, -0.579, -0.634, 2.438, -0.25, -0.634, 2.438, -0.36, -0.415, -0.195, -0.689, -0.14, -0.25, 2.493, -0.25, -0.305, -0.195, -0.25, 2.493, 2.219, -0.36, -0.47, -0.195, -0.25, 2.658, -0.25, -0.25, 1.725, -0.305, -0.14, -0.525, -0.47, -0.36, -0.36, -0.634, -0.525, -0.689, -0.195, -0.579, -0.36, -0.525, -0.305, -0.415, -0.579, -0.579, -0.195, 2.603, -0.25, -0.579, -0.415, -0.579, -0.36, -0.086, -0.525, -0.086, 2.164, -0.36, -0.195, 2.603, -0.579, -0.305, -0.36, -0.47, 1.999, -0.14, -0.36, -0.579, -0.47, -0.195, -0.579, -0.689, -0.47, -0.14, -0.634, -0.36, -0.634, -0.579, -0.525, 2.548, -0.36, -0.634, -0.305, 2.713, -0.579, -0.305, -0.305, -0.634, -0.195, -0.25, -0.36, -0.36, -0.47, -0.579, -0.36, -0.25, 2.658, -0.525, 2.713, -0.195, -0.25, -0.525, -0.195, -0.305, -0.195, -0.25, -0.36, -0.14, -0.579, -0.579, -0.634, 2.658, -0.47, -0.415, -0.14, -0.634, -0.195, -0.14, 2.493, -0.415, -0.47, -0.525, -0.25, -0.47, -0.634, 1.725, -0.415, -0.305, -0.415, -0.634, -0.634, -0.25, -0.14, 2.328, -0.195, -0.47, -0.086, -0.14, -0.47, -0.195)
fifaRaw <- c(fifaRaw, -0.689, -0.634, -0.086, -0.305, -0.195, -0.36, -0.305, -0.525, -0.579, 2.219, -0.25, -0.47, -0.579, -0.634, -0.086, 2.109, -0.195, 1.56, -0.36, -0.195, -0.47, -0.195, -0.14, -0.195, -0.086, -0.47, -0.36, -0.525, -0.14, -0.25, -0.36, -0.579, -0.579, 2.603, -0.47, -0.47, -0.579, -0.525, 2.767, 3.261, -0.36, -0.525, -0.305, -0.305, -0.47, -0.195, -0.25, -0.634, -0.47, -0.305, -0.305, -0.525, -0.305, 2.438, -0.634, -0.305, -0.579, -0.086, 0.683, 2.438, 2.603, 2.713, -0.47, -0.525, -0.25, -0.195, -0.634, -0.634, -0.305, -0.195, -0.47, -0.25, -0.415, -0.14, -0.305, -0.579, -0.25, -0.36, 2.713, -0.14, -0.305, -0.195, -0.47, -0.305, -0.25, -0.567, -0.356, -0.144, 2.074, -0.408, 3.078, -0.144, -0.62, -0.197, -0.408, -0.461, -0.356, -0.514, -0.408, -0.514, -0.461, -0.197, -0.461, -0.197, -0.144, 2.022, -0.197, 2.391, -0.303, -0.514, 2.339, -0.567, -0.461, -0.62, -0.408, -0.197, -0.25, -0.144, -0.25, 3.237, -0.303, -0.567, -0.303, -0.408, -0.514, -0.197, -0.25, 2.603, -0.25, -0.356, -0.408, -0.567, -0.197, 2.708, -0.62, -0.144, -0.514, -0.356, 2.761, -0.356, 2.708, -0.567, -0.514, 2.497, -0.567, -0.091, -0.356, -0.567, -0.144, -0.197, -0.144, -0.197, -0.408, -0.62, -0.303, -0.197, -0.144, -0.567, -0.567, -0.514, 2.708, -0.408, -0.356, -0.62, -0.356, -0.567, -0.567, -0.356, -0.408, 2.92, -0.091, -0.197, -0.356, -0.408, -0.461, -0.25, -0.514, -0.303, -0.144, -0.144, -0.408, -0.673, -0.25, -0.303, -0.303, -0.408, -0.567, -0.25, -0.25, -0.197, -0.197, -0.303, -0.514, -0.514, -0.197, 2.286, -0.25, -0.62, 1.81, -0.567, -0.514, -0.356, -0.303, -0.25, -0.25, -0.091, -0.356, 2.603, -0.567, 2.814, -0.25, -0.567, -0.356, -0.514, 2.444, -0.197, -0.303, -0.567, -0.514, -0.567, -0.25, -0.356, 2.603, -0.356, -0.303, -0.408, -0.197, -0.673, -0.25, -0.62, -0.303, -0.25, 3.025, -0.303, -0.62, -0.461, -0.461, -0.514, -0.567, -0.514, -0.461, -0.514, -0.356, -0.514, -0.514, -0.144, -0.567, -0.303, -0.144, 2.497, -0.303, -0.461, 2.603, 2.656, 2.814, -0.091, -0.303, -0.567, 2.074, -0.514, -0.091, -0.461, -0.356, -0.197, -0.303, -0.567, 1.388, -0.673, -0.144, -0.461, -0.567, -0.25, -0.567, -0.461, -0.144, -0.567, -0.197, 1.757, 2.444, -0.303, -0.461, -0.356, -0.197, -0.62, 2.391, 2.867, -0.25, -0.25, -0.144, -0.62, -0.62, -0.25, -0.197, -0.197, -0.514, -0.303, -0.673, -0.461, -0.197, -0.408, -0.514, -0.673, -0.461, -0.303, -0.408, -0.408, -0.408, -0.567, -0.303, -0.514, -0.197, -0.144, -0.197, -0.62, 2.127, -0.461, -0.567, -0.303, -0.144, 2.814, -0.461, -0.356, 2.127, -0.62, -0.408, 2.286, -0.461, -0.62, -0.461, -0.408, -0.144, -0.408, -0.303, -0.25, -0.25, -0.197, -0.197, -0.303, -0.144, -0.408, -0.673, -0.461, -0.197, 1.599, -0.461, -0.62, -0.303, 2.55, -0.197, -0.514, -0.303, -0.514, -0.62, -0.62, -0.62, -0.673)
fifaRaw <- c(fifaRaw, -0.408, -0.567, 2.603, -0.197, -0.144, -0.356, -0.25, -0.62, 3.025, -0.62, -0.567, 2.708, -0.62, -0.144, -0.197, -0.567, -0.25, -0.197, 2.708, -0.197, -0.408, -0.673, -0.303, 2.391, 2.233, -0.197, -0.303, -0.461, -0.62, 2.497, -0.303, -0.408, 2.444, -0.408, -0.567, -0.25, -0.197, -0.303, -0.25, -0.408, -0.514, -0.408, -0.197, -0.356, -0.408, -0.408, -0.25, -0.62, -0.567, -0.567, -0.25, 2.761, -0.461, -0.197, -0.514, -0.356, -0.25, -0.461, -0.303, -0.408, 2.814, -0.303, -0.673, 2.55, -0.356, -0.356, -0.144, -0.356, 1.863, -0.091, -0.567, -0.514, -0.567, -0.303, -0.514, -0.197, -0.144, -0.408, -0.514, -0.62, -0.303, -0.62, -0.144, 2.339, -0.62, -0.62, -0.408, 3.025, -0.567, -0.197, -0.514, -0.303, -0.567, -0.356, -0.25, -0.461, -0.356, -0.62, -0.356, -0.408, 2.603, -0.62, 2.391, -0.356, -0.461, -0.144, -0.356, -0.673, -0.303, -0.461, -0.303, -0.567, -0.303, -0.25, -0.673, 2.761, -0.62, -0.408, -0.303, -0.62, -0.461, -0.567, 2.127, -0.303, -0.25, -0.673, -0.197, -0.408, -0.144, 1.652, -0.514, -0.567, -0.303, -0.303, -0.514, -0.408, -0.197, 2.444, -0.197, -0.144, -0.303, -0.461, -0.303, -0.144, -0.25, -0.62, -0.408, -0.197, -0.197, -0.144, -0.567, -0.197, -0.567, 2.074, -0.303, -0.144, -0.356, -0.25, -0.303, 1.863, -0.567, 2.761, -0.62, -0.62, -0.25, -0.356, -0.514, -0.514, -0.197, -0.197, -0.567, -0.567, -0.461, -0.461, -0.408, -0.408, -0.408, 2.656, -0.356, -0.567, -0.567, -0.356, 3.237, 2.761, -0.567, -0.303, -0.461, -0.303, -0.356, -0.144, -0.144, -0.461, -0.356, -0.567, -0.62, -0.514, -0.356, 2.814, -0.356, -0.356, -0.25, -0.567, -0.197, 2.286, 2.603, 2.603, -0.567, -0.461, -0.356, -0.461, -0.303, -0.091, -0.514, -0.514, -0.144, -0.144, -0.197, -0.197, -0.567, -0.567, -0.197, -0.356, 2.339, -0.461, -0.144, -0.408, -0.197, -0.461, -0.567, -0.36, -0.308, -0.36, 2.239, -0.672, 3.487, -0.204, -0.308, -0.62, -0.412, -0.568, -0.256, -0.204, -0.204, -0.464, -0.412, -0.568, -0.36, -0.1, -0.412, 2.135, -0.62, 2.447, -0.152, -0.308, 2.447, -0.62, -0.308, -0.256, -0.412, -0.464, -0.412, -0.516, -0.204, 3.435, -0.308, -0.516, -0.256, -0.36, -0.464, -0.256, -0.464, 2.551, -0.568, -0.412, -0.568, -0.204, -0.152, 2.603, -0.308, -0.62, -0.204, -0.204, 3.123, -0.62, 2.915, -0.1, -0.36, 1.771, -0.256, -0.36, -0.256, -0.464, -0.36, -0.568, -0.412, -0.36, -0.568, -0.568, -0.412, -0.204, -0.464, -0.204, -0.412, -0.204, 2.811, -0.412, -0.516, -0.36, -0.464, -0.308, -0.152, -0.308, -0.204, 3.019, -0.464, -0.152, -0.256, -0.412, -0.412, -0.204, -0.568, -0.308, -0.256, -0.568, -0.204, -0.308, -0.36, -0.568, -0.464, -0.308, -0.516, -0.308, -0.568, -0.256, -0.204, -0.516, -0.204, -0.516, -0.1, 2.083, -0.568, -0.568, 2.187, -0.568, -0.568, -0.256, -0.62, -0.204, -0.36, -0.256, -0.256, 2.759, -0.152, 2.499, -0.62, -0.204, -0.62)
fifaRaw <- c(fifaRaw, -0.36, 2.499, -0.568, -0.412, -0.412, -0.204, -0.308, -0.256, -0.256, 2.187, -0.412, -0.62, -0.672, -0.204, -0.568, -0.464, -0.204, -0.256, -0.568, 3.071, -0.256, -0.256, -0.516, -0.308, -0.152, -0.1, -0.516, -0.308, -0.568, -0.308, -0.568, -0.308, -0.36, -0.516, -0.1, -0.412, 2.707, -0.256, -0.412, 2.967, 2.655, 2.499, -0.308, -0.568, -0.256, 2.499, -0.308, -0.152, -0.516, -0.256, -0.36, -0.204, -0.672, 1.668, -0.516, -0.152, -0.1, -0.36, -0.256, -0.62, -0.464, -0.568, -0.516, -0.308, 1.875, 2.395, -0.204, -0.568, -0.516, -0.464, -0.308, 2.551, 2.759, -0.516, -0.36, -0.62, -0.204, -0.204, -0.464, -0.308, -0.568, -0.516, -0.36, -0.412, -0.152, -0.464, -0.412, -0.36, -0.464, -0.412, -0.256, -0.412, -0.204, -0.256, -0.36, -0.308, -0.412, -0.204, -0.464, -0.36, -0.204, 1.771, -0.464, -0.62, -0.568, -0.36, 2.603, -0.464, -0.256, 2.135, -0.412, -0.204, 2.395, -0.62, -0.62, -0.516, -0.36, -0.204, -0.308, -0.464, -0.308, -0.62, -0.568, -0.204, -0.568, -0.464, -0.412, -0.62, -0.568, -0.204, 1.46, -0.412, -0.204, -0.36, 2.343, -0.412, -0.464, -0.36, -0.36, -0.256, -0.516, -0.308, -0.204, -0.152, -0.204, 3.071, -0.1, -0.308, -0.516, -0.256, -0.1, 1.979, -0.152, -0.152, 2.499, -0.36, -0.36, -0.412, -0.36, -0.204, -0.1, 2.291, -0.308, -0.152, -0.308, -0.36, 2.187, 2.395, -0.464, -0.464, -0.308, -0.256, 2.343, -0.412, -0.568, 2.343, -0.516, -0.568, -0.256, -0.308, -0.412, -0.516, -0.62, -0.464, -0.308, -0.568, -0.464, -0.152, -0.464, -0.62, -0.464, -0.464, -0.204, -0.672, 3.019, -0.62, -0.204, -0.256, -0.568, -0.36, -0.412, -0.308, -0.412, 2.655, -0.464, -0.672, 2.863, -0.568, -0.308, -0.412, -0.204, 1.927, -0.1, -0.36, -0.62, -0.516, -0.568, -0.256, -0.516, -0.516, -0.152, -0.62, -0.568, -0.204, -0.464, -0.152, 2.447, -0.36, -0.36, -0.36, 2.655, -0.412, -0.308, -0.412, -0.204, -0.36, -0.412, -0.412, -0.412, -0.568, -0.516, -0.256, -0.62, 2.759, -0.516, 2.603, -0.36, -0.36, -0.204, -0.152, -0.464, -0.204, -0.1, -0.308, -0.412, -0.308, -0.36, -0.516, 2.915, -0.62, -0.204, -0.412, -0.62, -0.308, -0.152, 2.187, -0.568, -0.36, -0.308, -0.308, -0.256, -0.464, 1.875, -0.516, -0.516, -0.62, -0.568, -0.36, -0.36, -0.1, 2.811, -0.672, -0.204, -0.568, -0.412, -0.516, -0.464, -0.308, -0.36, -0.308, -0.36, -0.36, -0.204, -0.36, -0.62, -0.516, 1.927, -0.256, -0.62, -0.62, -0.62, -0.568, 2.187, -0.568, 2.603, -0.62, -0.568, -0.308, -0.62, -0.516, -0.152, -0.256, -0.36, -0.1, -0.464, -0.256, -0.308, -0.568, -0.308, -0.516, 2.759, -0.36, -0.412, -0.568, -0.412, 3.123, 2.759, -0.308, -0.36, -0.152, -0.204, -0.62, -0.516, -0.204, -0.152, -0.152, -0.516, -0.62, -0.516, -0.256, 2.863, -0.412, -0.412, -0.36, -0.412, -0.204, 2.395, 2.135, 2.499, -0.36, -0.464, -0.204, -0.308, -0.568, -0.412, -0.256, -0.516, -0.256)
fifaRaw <- c(fifaRaw, -0.152, -0.568, -0.672, -0.62, -0.516, -0.308, -0.568, 1.823, -0.36, -0.516, -0.36, -0.464, -0.568, -0.256, -0.077, -0.31, -0.205, -0.373, -0.371, 1.509, -0.278, -0.36, -0.282, -0.251, -0.378, -0.26, -0.319, -0.273, 1.145, -0.382, -0.346, 2.421, -0.205, -0.389, -0.373, -0.328, -0.328, -0.395, -0.31, -0.367, -0.282, -0.278, -0.305, 0.47, -0.389, -0.375, -0.333, -0.314, 1.236, -0.356, -0.333, -0.077, -0.223, 0.123, 0.871, -0.205, -0.292, -0.205, 0.214, 0.871, 0.178, -0.241, -0.384, -0.356, 1.418, -0.26, -0.31, 0.251, -0.319, -0.041, 0.506, -0.333, -0.384, -0.187, -0.296, 0.78, -0.205, -0.187, -0.342, -0.187, -0.342, -0.364, -0.31, -0.241, -0.356, -0.31, -0.287, 13.545, -0.241, -0.187, -0.132, -0.346, -0.278, 1.145, 0.014, -0.305, -0.282, -0.342, -0.323, -0.319, -0.391, -0.373, -0.364, -0.114, 0.397, -0.314, -0.255, -0.282, -0.205, -0.205, -0.328, 0.78, -0.273, -0.278, -0.241, -0.132, -0.223, -0.382, -0.278, -0.362, -0.382, -0.292, -0.223, 0.324, -0.376, -0.041, 0.689, -0.378, -0.301, 0.689, -0.333, -0.373, -0.305, -0.187, -0.351, -0.187, -0.077, 0.251, -0.367, -0.391, 0.087, 1.236, 0.397, -0.342, -0.314, -0.369, -0.333, -0.333, -0.251, -0.346, -0.346, -0.369, -0.391, -0.255, 0.16, -0.228, 5.339, 0.36, -0.354, -0.077, -0.023, 0.306, -0.004, -0.301, -0.356, -0.296, -0.273, -0.077, 0.506, -0.205, -0.287, 0.251, -0.187, 0.214, -0.237, -0.168, -0.337, -0.282, -0.278, -0.223, -0.333, 0.196, -0.323, -0.328, -0.373, 0.269, -0.351, -0.354, -0.31, -0.337, -0.187, -0.319, -0.346, -0.328, -0.346, -0.4, -0.296, -0.059, -0.31, -0.168, -0.077, -0.323, -0.382, -0.375, -0.168, -0.114, -0.395, -0.31, -0.384, -0.292, -0.187, -0.296, 0.488, -0.232, -0.132, -0.387, -0.319, -0.273, -0.333, -0.36, -0.114, -0.273, 0.36, 0.415, -0.187, -0.305, 0.251, -0.385, 1.509, -0.319, -0.223, -0.223, -0.346, -0.387, -0.15, -0.382, -0.319, -0.391, 0.78, -0.205, -0.205, -0.301, -0.278, -0.376, -0.342, -0.305, -0.264, 1.418, -0.041, -0.187, 0.36, -0.358, 0.16, -0.319, -0.301, -0.31, 0.597, -0.358, -0.041, -0.319, -0.278, -0.38, -0.387, -0.323, 0.178, -0.041, -0.205, -0.023, -0.292, -0.337, 0.397, -0.251, -0.395, -0.387, -0.38, -0.378, -0.282, -0.387, -0.228, -0.278, -0.228, -0.246, 2.695, -0.269, -0.364, 5.795, 1.236, 0.306, -0.333, -0.36, -0.384, -0.237, 1.053, -0.395, -0.342, 0.178, -0.323, 0.379, 1.6, -0.333, 1.236, -0.077, 0.597, -0.333, -0.168, -0.241, 0.78, -0.367, -0.393, -0.328, -0.387, 0.506, 0.506, -0.342, -0.382, 6.251, -0.337, -0.323, -0.278, -0.269, 0.324, 4.609, -0.26, -0.296, -0.132, -0.246, -0.273, 0.597, -0.205, -0.351, -0.391, -0.041, -0.223, -0.36, -0.351, 0.105, 0.269, -0.296, 0.488, 2.695, -0.385, 1.783, -0.205, -0.376)
fifaRaw <- c(fifaRaw, -0.287, -0.223, -0.337, -0.333, -0.096, -0.269, -0.353, -0.246, -0.305, -0.384, -0.356, 0.78, -0.246, 0.142, -0.26, -0.205, -0.384, -0.228, 0.196, -0.31, 1.145, -0.38, -0.205, 0.324, -0.323, -0.387, -0.319, -0.241, -0.36, -0.393, 0.214, 0.032, -0.367, -0.223, -0.15, -0.353, 0.287, 1.874, -0.31, -0.26, -0.187, -0.36, -0.351, -0.251, -0.241, -0.376, -0.337, 0.196, -0.059, -0.38, -0.269, -0.342, -0.237, -0.328, -0.273, -0.38, 0.105, -0.168, -0.114, 0.397, -0.389, 0.032, -0.31, -0.38, 1.053, 0.105, -0.077, 0.962, 1.053, -0.389, -0.395, 0.78, -0.369, -0.365, -0.282, -0.264, -0.342, 0.689, -0.223, -0.305, -0.059, -0.395, 0.251, -0.292, -0.096, 0.452, -0.369, -0.255, -0.264, 2.421, -0.314, -0.337, -0.31, -0.168, -0.396, -0.351, -0.023, -0.269, 0.433, -0.362, -0.378, -0.282, -0.36, -0.382, -0.232, -0.278, 0.306, -0.255, 0.689, 0.014, -0.168, -0.328, -0.314, 0.324, -0.365, -0.369, -0.337, 0.597, -0.282, -0.328, -0.287, -0.228, -0.023, 0.871, 0.069, -0.365, -0.354, -0.323, 0.306, -0.396, -0.378, -0.26, -0.273, 0.689, -0.301, -0.393, 0.306, -0.292, -0.132, -0.31, 0.597, -0.369, -0.255, -0.351, -0.358, -0.337, -0.358, 8.895, 0.306, -0.319, 0.178, -0.393, -0.077, -0.31, -0.38, 0.36, 0.105, -0.337, -0.205, -0.337, -0.375, 1.509, -0.205, -0.389, 0.397, -0.389, 0.105, 1.418, -0.328, -0.365)
fifa19mtx <- matrix(data=fifaRaw, ncol=37, nrow=500, byrow=FALSE)
fifa19_scaled <- as.data.frame(fifa19mtx)
names(fifa19_scaled) <- c('Age', 'Potential', 'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys', 'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing', 'BallControl', 'Acceleration', 'SprintSpeed', 'Agility', 'Reactions', 'Balance', 'ShotPower', 'Jumping', 'Stamina', 'Strength', 'LongShots', 'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties', 'Composure', 'Marking', 'StandingTackle', 'SlidingTackle', 'GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes', 'PlayerValue')
str(fifa19_scaled)
## 'data.frame': 500 obs. of 37 variables:
## $ Age : num 0.569 -1.555 -0.068 -0.705 -1.342 ...
## $ Potential : num -0.198 1.373 -0.023 -0.198 -0.023 ...
## $ Crossing : num 0.618 0.095 0.409 -1.841 0.042 ...
## $ Finishing : num 1.232 0.475 1.081 -1.242 -0.636 ...
## $ HeadingAccuracy: num 0.467 0.189 0.801 -1.867 -0.033 ...
## $ ShortPassing : num 0.061 0.26 0.458 -2.386 0.26 ...
## $ Volleys : num 1.011 0.622 0.733 -1.488 -0.044 ...
## $ Dribbling : num 0.824 0.302 0.824 -1.837 0.093 ...
## $ Curve : num 0.902 0.902 0.635 -1.657 -0.164 ...
## $ FKAccuracy : num 1.237 0.747 0.747 -1.432 -0.015 ...
## $ LongPassing : num -0.572 0.269 0.01 -2.124 0.463 ...
## $ BallControl : num 0.613 0.213 0.556 -2.13 -0.358 ...
## $ Acceleration : num 1.784 0.282 1 -2.003 -0.044 ...
## $ SprintSpeed : num 1.292 0.086 0.823 -2.057 -0.048 ...
## $ Agility : num 1.582 0.508 1.247 -2.515 -0.298 ...
## $ Reactions : num 0.662 -0.582 0.21 -0.921 -1.034 ...
## $ Balance : num 1.737 0.969 1.249 -0.705 0.341 ...
## $ ShotPower : num 1.18 0.573 0.628 -1.854 -0.144 ...
## $ Jumping : num 0.881 -0.454 0.714 -2.038 -0.454 ...
## $ Stamina : num 1.44 0.202 -0.151 -2.095 -0.799 ...
## $ Strength : num -0.138 -0.787 -0.381 -2.164 -1.597 ...
## $ LongShots : num 1.229 0.426 0.627 -1.481 -0.377 ...
## $ Aggression : num -0.79 0.142 -1.373 -2.189 0.434 ...
## $ Interceptions : num -1.686 0.133 -0.537 -1.207 0.277 ...
## $ Positioning : num 1.227 0.436 0.881 -1.738 -0.848 ...
## $ Vision : num 0.517 0.517 0.731 -1.907 0.374 ...
## $ Penalties : num 1.026 0.712 0.712 -1.606 0.086 ...
## $ Composure : num 0.801 0.31 -0.673 -1.573 -0.918 ...
## $ Marking : num 0.232 -0.472 -0.522 -1.428 0.232 ...
## $ StandingTackle : num -1.551 -0.023 -1.506 -1.281 0.382 ...
## $ SlidingTackle : num -1.516 0.053 -1.331 -1.377 0.837 ...
## $ GKDiving : num -0.156 -0.523 -0.418 2.156 -0.261 ...
## $ GKHandling : num -0.485 -0.323 -0.216 1.987 -0.592 ...
## $ GKKicking : num -0.525 -0.25 -0.25 1.999 -0.36 ...
## $ GKPositioning : num -0.567 -0.356 -0.144 2.074 -0.408 ...
## $ GKReflexes : num -0.36 -0.308 -0.36 2.239 -0.672 ...
## $ PlayerValue : num -0.077 -0.31 -0.205 -0.373 -0.371 ...
# Glimpse at the dataset
glimpse(fifa19_scaled)
## Observations: 500
## Variables: 37
## $ Age <dbl> 0.569, -1.555, -0.068, -0.705, -1.342, -0.280, -0.0...
## $ Potential <dbl> -0.198, 1.373, -0.023, -0.198, -0.023, 2.246, -0.19...
## $ Crossing <dbl> 0.618, 0.095, 0.409, -1.841, 0.042, -1.789, -0.586,...
## $ Finishing <dbl> 1.232, 0.475, 1.081, -1.242, -0.636, -1.646, -0.283...
## $ HeadingAccuracy <dbl> 0.467, 0.189, 0.801, -1.867, -0.033, -2.200, 0.690,...
## $ ShortPassing <dbl> 0.061, 0.260, 0.458, -2.386, 0.260, -2.121, -0.137,...
## $ Volleys <dbl> 1.011, 0.622, 0.733, -1.488, -0.044, -1.599, -0.877...
## $ Dribbling <dbl> 0.824, 0.302, 0.824, -1.837, 0.093, -2.150, -1.107,...
## $ Curve <dbl> 0.902, 0.902, 0.635, -1.657, -0.164, -1.870, -0.804...
## $ FKAccuracy <dbl> 1.237, 0.747, 0.747, -1.432, -0.015, -1.649, -0.887...
## $ LongPassing <dbl> -0.572, 0.269, 0.010, -2.124, 0.463, -1.801, -0.572...
## $ BallControl <dbl> 0.613, 0.213, 0.556, -2.130, -0.358, -2.073, -0.530...
## $ Acceleration <dbl> 1.784, 0.282, 1.000, -2.003, -0.044, -1.481, -1.220...
## $ SprintSpeed <dbl> 1.292, 0.086, 0.823, -2.057, -0.048, -1.789, -1.186...
## $ Agility <dbl> 1.582, 0.508, 1.247, -2.515, -0.298, -1.709, -0.366...
## $ Reactions <dbl> 0.662, -0.582, 0.210, -0.921, -1.034, -0.017, -0.35...
## $ Balance <dbl> 1.737, 0.969, 1.249, -0.705, 0.341, -1.403, -0.008,...
## $ ShotPower <dbl> 1.180, 0.573, 0.628, -1.854, -0.144, -1.964, -0.695...
## $ Jumping <dbl> 0.881, -0.454, 0.714, -2.038, -0.454, 0.714, 0.881,...
## $ Stamina <dbl> 1.440, 0.202, -0.151, -2.095, -0.799, -1.153, 0.556...
## $ Strength <dbl> -0.138, -0.787, -0.381, -2.164, -1.597, -0.868, 0.7...
## $ LongShots <dbl> 1.229, 0.426, 0.627, -1.481, -0.377, -1.481, -0.226...
## $ Aggression <dbl> -0.790, 0.142, -1.373, -2.189, 0.434, -1.315, 0.725...
## $ Interceptions <dbl> -1.686, 0.133, -0.537, -1.207, 0.277, -1.351, 0.516...
## $ Positioning <dbl> 1.227, 0.436, 0.881, -1.738, -0.848, -1.688, -1.293...
## $ Vision <dbl> 0.517, 0.517, 0.731, -1.907, 0.374, -0.838, -1.123,...
## $ Penalties <dbl> 1.026, 0.712, 0.712, -1.606, 0.086, -1.042, -0.854,...
## $ Composure <dbl> 0.801, 0.310, -0.673, -1.573, -0.918, 0.064, -0.263...
## $ Marking <dbl> 0.232, -0.472, -0.522, -1.428, 0.232, -1.377, 0.836...
## $ StandingTackle <dbl> -1.551, -0.023, -1.506, -1.281, 0.382, -1.641, 1.01...
## $ SlidingTackle <dbl> -1.516, 0.053, -1.331, -1.377, 0.837, -1.562, 0.976...
## $ GKDiving <dbl> -0.156, -0.523, -0.418, 2.156, -0.261, 3.364, -0.62...
## $ GKHandling <dbl> -0.485, -0.323, -0.216, 1.987, -0.592, 3.115, -0.21...
## $ GKKicking <dbl> -0.525, -0.250, -0.250, 1.999, -0.360, 2.328, -0.52...
## $ GKPositioning <dbl> -0.567, -0.356, -0.144, 2.074, -0.408, 3.078, -0.14...
## $ GKReflexes <dbl> -0.360, -0.308, -0.360, 2.239, -0.672, 3.487, -0.20...
## $ PlayerValue <dbl> -0.077, -0.310, -0.205, -0.373, -0.371, 1.509, -0.2...
coefs <- data.frame(OLS=as.vector(lm(PlayerValue ~ . -Interceptions, data=fifa19_scaled)$coef[-1]))
coefs
## OLS
## 1 0.14080348
## 2 0.52499805
## 3 0.13608502
## 4 0.15456543
## 5 -0.01529349
## 6 -0.22318485
## 7 0.18580783
## 8 -0.29767876
## 9 0.02934133
## 10 0.05287707
## 11 0.17011601
## 12 0.17507295
## 13 0.06438190
## 14 0.03084774
## 15 -0.01369804
## 16 0.19450983
## 17 0.02776689
## 18 -0.10462695
## 19 -0.08338769
## 20 -0.07010475
## 21 0.02518615
## 22 0.00198291
## 23 -0.01617527
## 24 -0.12197746
## 25 -0.05634959
## 26 -0.03740878
## 27 -0.04404595
## 28 -0.04982907
## 29 0.20388990
## 30 -0.09861257
## 31 0.11197957
## 32 0.08476926
## 33 0.14563308
## 34 -0.30283233
## 35 -0.10014565
# Ridge regression: mdlRidge
mdlRidge <- caret::train(PlayerValue ~ ., data = fifa19_scaled, method = "ridge", tuneLength = 8)
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked _by_ '.GlobalEnv':
##
## barley
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Plot ridge train object
plot(mdlRidge)
# Ridge regression coefficients
coefRidge <- predict(mdlRidge$finalModel, type='coef', mode='norm')$coefficients
coefs$RidgeAll <- coefRidge[nrow(coefRidge),]
print(coefs)
## OLS RidgeAll
## 1 0.14080348 0.063757232
## 2 0.52499805 0.453334036
## 3 0.13608502 0.067490561
## 4 0.15456543 0.060594977
## 5 -0.01529349 0.008445445
## 6 -0.22318485 -0.059108238
## 7 0.18580783 0.124150501
## 8 -0.29767876 -0.074407150
## 9 0.02934133 0.038889391
## 10 0.05287707 0.057464591
## 11 0.17011601 0.068074550
## 12 0.17507295 0.039128679
## 13 0.06438190 0.040408023
## 14 0.03084774 0.017734038
## 15 -0.01369804 -0.003036336
## 16 0.19450983 0.199081287
## 17 0.02776689 0.006185195
## 18 -0.10462695 -0.040956383
## 19 -0.08338769 -0.080734840
## 20 -0.07010475 -0.046693927
## 21 0.02518615 0.019689478
## 22 0.00198291 0.021424749
## 23 -0.01617527 -0.009860133
## 24 -0.12197746 -0.062989656
## 25 -0.05634959 -0.051736655
## 26 -0.03740878 -0.005134552
## 27 -0.04404595 0.008680042
## 28 -0.04982907 -0.003446574
## 29 0.20388990 0.078066864
## 30 -0.09861257 0.016096396
## 31 0.11197957 0.052573952
## 32 0.08476926 0.034556053
## 33 0.14563308 0.042359446
## 34 -0.30283233 -0.033414894
## 35 -0.10014565 0.008586468
# Lasso regression: mdlLasso
mdlLasso <- caret::train(PlayerValue ~ ., data = fifa19_scaled, method = "lasso", tuneLength = 8)
# Plot lasso object
plot(mdlLasso)
# Get coefficients in every step: coefLasso
coefLasso <- predict(mdlLasso$finalModel, type='coef', mode='norm')$coefficients
# Get coefficients for top 5 and all variables
(coefs$LassoTop5 <- coefLasso[6, ])
## Age Potential Crossing Finishing HeadingAccuracy
## 0.000000e+00 3.934720e-01 0.000000e+00 0.000000e+00 0.000000e+00
## ShortPassing Volleys Dribbling Curve FKAccuracy
## 0.000000e+00 6.717817e-03 0.000000e+00 0.000000e+00 6.410262e-02
## LongPassing BallControl Acceleration SprintSpeed Agility
## 6.695871e-06 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## Reactions Balance ShotPower Jumping Stamina
## 2.081593e-01 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## Strength LongShots Aggression Positioning Vision
## 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## Penalties Composure Marking StandingTackle SlidingTackle
## 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## GKDiving GKHandling GKKicking GKPositioning GKReflexes
## 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
(coefs$LassoAll <- coefLasso[nrow(coefLasso), ])
## Age Potential Crossing Finishing HeadingAccuracy
## 0.14080348 0.52499805 0.13608502 0.15456543 -0.01529349
## ShortPassing Volleys Dribbling Curve FKAccuracy
## -0.22318485 0.18580783 -0.29767876 0.02934133 0.05287707
## LongPassing BallControl Acceleration SprintSpeed Agility
## 0.17011601 0.17507295 0.06438190 0.03084774 -0.01369804
## Reactions Balance ShotPower Jumping Stamina
## 0.19450983 0.02776689 -0.10462695 -0.08338769 -0.07010475
## Strength LongShots Aggression Positioning Vision
## 0.02518615 0.00198291 -0.01617527 -0.12197746 -0.05634959
## Penalties Composure Marking StandingTackle SlidingTackle
## -0.03740878 -0.04404595 -0.04982907 0.20388990 -0.09861257
## GKDiving GKHandling GKKicking GKPositioning GKReflexes
## 0.11197957 0.08476926 0.14563308 -0.30283233 -0.10014565
# ElasticNet regression: mdlElasticNet
mdlElasticNet <- caret::train(PlayerValue ~ ., data = fifa19_scaled, method = "enet", tuneLength = 8)
# Plot elastic net object
plot(mdlElasticNet)
# Get elastic net coefficients: coefElasticNet
coefElasticNet <- predict(mdlElasticNet$finalModel, type="coef", mode="norm")$coefficients
# Get coefficients for top 5 and all variables
(coefs$ElasticNetTop5 <- coefElasticNet[6, ])
## Age Potential Crossing Finishing HeadingAccuracy
## 0.000000000 0.394474977 0.000000000 0.000000000 0.000000000
## ShortPassing Volleys Dribbling Curve FKAccuracy
## 0.000000000 0.012047805 0.000000000 0.000000000 0.060546666
## LongPassing BallControl Acceleration SprintSpeed Agility
## 0.006535522 0.000000000 0.000000000 0.000000000 0.000000000
## Reactions Balance ShotPower Jumping Stamina
## 0.212491467 0.000000000 0.000000000 0.000000000 0.000000000
## Strength LongShots Aggression Positioning Vision
## 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## Penalties Composure Marking StandingTackle SlidingTackle
## 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## GKDiving GKHandling GKKicking GKPositioning GKReflexes
## 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
(coefs$ElasticNetAll <- coefElasticNet[nrow(coefElasticNet), ])
## Age Potential Crossing Finishing HeadingAccuracy
## 0.105432609 0.491074885 0.095533534 0.099838431 -0.001408899
## ShortPassing Volleys Dribbling Curve FKAccuracy
## -0.125702820 0.156053369 -0.151937955 0.034892424 0.054804480
## LongPassing BallControl Acceleration SprintSpeed Agility
## 0.111216957 0.069901622 0.052288414 0.021620556 -0.008381661
## Reactions Balance ShotPower Jumping Stamina
## 0.194134254 0.016142715 -0.074661110 -0.084036138 -0.058290987
## Strength LongShots Aggression Positioning Vision
## 0.022426786 0.018131234 -0.013931411 -0.091620323 -0.057940407
## Penalties Composure Marking StandingTackle SlidingTackle
## -0.020553899 -0.018366690 -0.025703054 0.111342215 -0.012303960
## GKDiving GKHandling GKKicking GKPositioning GKReflexes
## 0.060755399 0.043300637 0.061644527 -0.117459268 -0.029230937
# Fit MLP using nnet: mdlNNet
# set.seed(124)
# mdlNNet <- nnet(Class ~ ., data = pulsar_train, size = 3)
# Calculate train error: train_error
# pred_train <- predict(mdlNNet, pulsar_train, type="class")
# train_cm <- table(pred_train, pulsar_train$Class)
# (train_error <- 1 - sum(diag(train_cm)) / sum(train_cm))
# Calculate test error: test_error
# pred_test <- predict(mdlNNet, pulsar_test, type="class")
# test_cm <- table(pred_test, pulsar_test$Class)
# (test_error <- 1 - sum(diag(test_cm)) / sum(test_cm))
# Fit MLP using nnet: mdlNNet
# set.seed(124)
# mdlNNet <- nnet(Class ~ ., data = pulsar_train, size = 5)
# Calculate train error: train_error
# pred_train <- predict(mdlNNet, pulsar_train, type="class")
# train_cm <- table(pred_train, pulsar_train$Class)
# (train_error <- 1 - sum(diag(train_cm)) / sum(train_cm))
# Calculate test error: test_error
# pred_test <- predict(mdlNNet, pulsar_test, type="class")
# test_cm <- table(pred_test, pulsar_test$Class)
# (test_error <- 1 - sum(diag(test_cm)) / sum(test_cm))
# Create the 5-fold cross validation training control object
# control <- trainControl(method = "cv", number = 5, savePredictions = TRUE, classProbs = TRUE)
# Create the vector of base learners: baseLearners
# baseLearners <- c('rpart', 'glm', 'knn', 'svmRadial')
# Create and summarize the list of base learners: models
# models <- caretList(Class ~ ., data = training, trControl = control, methodList = baseLearners)
# summary(models)
# Classification results in each resample: results
# results <- resamples(models)
# Summarize and print the results in one line
# (results_summary <- summary(results))
# Show the correlation among the base learners' results
# modelCor(results)
# Display a scatter plot matrix of these results
# splom(results)
# Load caretEnsemble
# library(caretEnsemble)
# Set the seed
# set.seed(123)
# Stack the base learners
# stack.glm <- caretStack(models, method="glm", metric="Accuracy", trControl=control)
# Print the stacked model
# stack.glm
# Summarize the performance results for each base learner
# summary(results)
evaluateModel <- function(trainObject, testData) {
# Compute binary yes/no predictions and class probabilities
model_preds <- predict(trainObject, testData)
model_probs <- predict(trainObject, testData, type="prob")
# Compute accuracy and AUC values
model_acc <- accuracy(testData$Class, model_preds)
model_auc <- auc(testData$Class == 'yes', model_probs[, 2])
# Return model accuracy and AUC
c(model_acc, model_auc)
}
# Evaluate the performance of each individual base learner
# baseLearnerStats <- sapply(X=stack.glm$models, FUN=evaluateModel, testing)
# baseLearnerDF <- data.frame(baseLearnerStats, row.names = c('acc', 'auc'))
# Compute stacked ensemble's accuracy on test data
# stack_preds <- predict(stack.glm, testing)
# stack_acc <- accuracy(testing$Class, stack_preds)
# Compute stacked ensemble's AUC on test data
# stack_preds_probs <- predict(stack.glm, testing, type="prob")
# stack_auc <- auc(testing$Class == 'yes', stack_preds_probs)
# Combine the stacked ensemble results
# (allLearnersDF <- cbind(baseLearnerDF, list(stack=c(stack_acc, stack_auc))))
Chapter 3 - Unsupervised Learning
K-means Clustering:
Clustering Algorithms:
Feature Selection:
Feature Extraction:
Example code includes:
mallData <- c(19, 21, 20, 23, 31, 22, 35, 23, 64, 30, 67, 35, 58, 24, 37, 22, 35, 20, 52, 35, 35, 25, 46, 31, 54, 29, 45, 35, 40, 23, 60, 21, 53, 18, 49, 21, 42, 30, 36, 20, 65, 24, 48, 31, 49, 24, 50, 27, 29, 31, 49, 33, 31, 59, 50, 47, 51, 69, 27, 53, 70, 19, 67, 54, 63, 18, 43, 68, 19, 32, 70, 47, 60, 60, 59, 26, 45, 40, 23, 49, 57, 38, 67, 46, 21, 48, 55, 22, 34, 50, 68, 18, 48, 40, 32, 24, 47, 27, 48, 20, 23, 49, 67, 26, 49, 21, 66, 54, 68, 66, 65, 19, 38, 19, 18, 19, 63, 49, 51, 50, 27, 38, 40, 39, 23, 31, 43, 40, 59, 38, 47, 39, 25, 31, 20, 29, 44, 32, 19, 35, 57, 32, 28, 32, 25, 28, 48, 32, 34, 34, 43, 39, 44, 38, 47, 27, 37, 30, 34, 30, 56, 29, 19, 31, 50, 36, 42, 33, 36, 32, 40, 28, 36, 36, 52, 30, 58, 27, 59, 35, 37, 32, 46, 29, 41, 30, 54, 28, 41, 36, 34, 32, 33, 38, 47, 35, 45, 32, 32, 30, 15000, 15000, 16000, 16000, 17000, 17000, 18000, 18000, 19000, 19000, 19000, 19000, 20000, 20000, 20000, 20000, 21000, 21000, 23000, 23000, 24000, 24000, 25000, 25000, 28000, 28000, 28000, 28000, 29000, 29000, 30000, 30000, 33000, 33000, 33000, 33000, 34000, 34000, 37000, 37000, 38000, 38000, 39000, 39000, 39000, 39000, 40000, 40000, 40000, 40000, 42000, 42000, 43000, 43000, 43000, 43000, 44000, 44000, 46000, 46000, 46000, 46000, 47000, 47000, 48000, 48000, 48000, 48000, 48000, 48000, 49000, 49000, 50000, 50000, 54000, 54000, 54000, 54000, 54000, 54000, 54000, 54000, 54000, 54000, 54000, 54000, 57000, 57000, 58000, 58000, 59000, 59000, 60000, 60000, 60000, 60000, 60000, 60000, 61000, 61000, 62000, 62000, 62000, 62000, 62000, 62000, 63000, 63000, 63000, 63000, 63000, 63000, 64000, 64000, 65000, 65000, 65000, 65000, 67000, 67000, 67000, 67000, 69000, 69000, 70000, 70000, 71000, 71000, 71000, 71000, 71000, 71000, 72000, 72000, 73000, 73000, 73000, 73000, 74000, 74000, 75000, 75000, 76000, 76000, 77000, 77000, 77000, 77000, 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000, 79000, 79000, 81000, 81000, 85000, 85000, 86000, 86000, 87000, 87000, 87000, 87000, 87000, 87000, 88000, 88000, 88000, 88000, 93000, 93000, 97000, 97000, 98000, 98000, 99000, 99000, 101000, 101000, 103000, 103000, 103000, 103000, 113000, 113000, 120000, 120000, 126000, 126000, 137000, 137000, 39, 81, 6, 77, 40, 76, 6, 94, 3, 72, 14, 99, 15, 77, 13, 79, 35, 66, 29, 98, 35, 73, 5, 73, 14, 82, 32, 61, 31, 87, 4, 73, 4, 92, 14, 81, 17, 73, 26, 75, 35, 92, 36, 61, 28, 65, 55, 47, 42, 42, 52, 60, 54, 60, 45, 41, 50, 46, 51, 46, 56, 55, 52, 59, 51, 59, 50, 48, 59, 47, 55, 42, 49, 56, 47, 54, 53, 48, 52, 42, 51, 55, 41, 44, 57, 46, 58, 55, 60, 46, 55, 41, 49, 40, 42, 52, 47, 50, 42, 49, 41, 48, 59, 55, 56, 42, 50, 46, 43, 48, 52, 54, 42, 46, 48, 50, 43, 59, 43, 57, 56, 40, 58, 91, 29, 77, 35, 95, 11, 75, 9, 75, 34, 71, 5, 88, 7, 73, 10, 72, 5, 93, 40, 87, 12, 97, 36, 74, 22, 90, 17, 88, 20, 76, 16, 89, 1, 78, 1, 73, 35, 83, 5, 93, 26, 75, 20, 95, 27, 63, 13, 75, 10, 92, 13, 86, 15, 69, 14, 90, 32, 86, 15, 88, 39, 97, 24, 68, 17, 85, 23, 69, 8, 91, 16, 79, 28, 74, 18, 83)
mall <- as.data.frame(matrix(data=mallData, ncol=3, byrow=FALSE))
names(mall) <- c("Age", "AnnualIncome", "SpendingScore")
# Glimpse over the mall data
glimpse(mall)
## Observations: 200
## Variables: 3
## $ Age <dbl> 19, 21, 20, 23, 31, 22, 35, 23, 64, 30, 67, 35, 58, 2...
## $ AnnualIncome <dbl> 15000, 15000, 16000, 16000, 17000, 17000, 18000, 1800...
## $ SpendingScore <dbl> 39, 81, 6, 77, 40, 76, 6, 94, 3, 72, 14, 99, 15, 77, ...
# Display the range of every variable
sapply(mall, range)
## Age AnnualIncome SpendingScore
## [1,] 18 15000 1
## [2,] 70 137000 99
# Age histogram
hist(mall$Age, breaks=10)
# Spending score histogram
hist(mall$SpendingScore, breaks=10)
# Annual income histogram
hist(mall$AnnualIncome, breaks=10)
mall_scaled <- scale(mall)
# Initialize vector: ratios
ratios <- rep(0, 10)
# Try different values of K
for (k in 1:10) {
# Cluster mall: mall_c
mall_c <- kmeans(mall_scaled, k, nstart=20)
# Save the ratio WSS/TSS in the kth position of ratios
ratios[k] <- mall_c$tot.withinss / mall_c$totss
}
# Line plot with ratios as a function of k
plot(ratios, type="b", xlab="number of clusters")
# Cluster mall_scaled data using k = 6: mall_6
set.seed(123)
mall_6 <- kmeans(mall_scaled, centers=6, nstart=20)
# Average each variable per cluster
mall %>%
mutate(cluster = mall_6$cluster) %>%
group_by(cluster) %>%
summarize_all(list(~mean(.)))
## # A tibble: 6 x 4
## cluster Age AnnualIncome SpendingScore
## <int> <dbl> <dbl> <dbl>
## 1 1 56.3 54267. 49.1
## 2 2 41.9 88939. 17.0
## 3 3 32.7 86538. 82.1
## 4 4 26.7 57579. 47.8
## 5 5 25.2 25833. 76.9
## 6 6 45.5 26286. 19.4
library(clValid)
## Loading required package: cluster
# Create the list of clustering methods: methods
methods <- c("hierarchical", "kmeans", "pam")
# Compare clustering methods: results
results <- clValid::clValid(mall_scaled, 2:10, clMethods = methods, validation = "internal")
## Warning in clValid::clValid(mall_scaled, 2:10, clMethods = methods, validation =
## "internal"): rownames for data not specified, using 1:nrow(data)
# Summarize the results
summary(results)
##
## Clustering Methods:
## hierarchical kmeans pam
##
## Cluster sizes:
## 2 3 4 5 6 7 8 9 10
##
## Validation Measures:
## 2 3 4 5 6 7 8 9 10
##
## hierarchical Connectivity 11.5218 16.0488 16.0488 21.6802 24.4683 30.0873 37.3337 42.8595 50.8476
## Dunn 0.0920 0.0926 0.1007 0.1216 0.1216 0.1262 0.1262 0.1538 0.1304
## Silhouette 0.3249 0.3400 0.3839 0.4096 0.3896 0.3800 0.3756 0.4069 0.3954
## kmeans Connectivity 15.2317 29.3147 34.4337 33.1044 34.9714 38.8690 45.2663 54.4786 65.6397
## Dunn 0.0596 0.0445 0.0593 0.0659 0.0554 0.0660 0.0673 0.1151 0.1452
## Silhouette 0.3355 0.3503 0.4040 0.4166 0.4274 0.4298 0.4171 0.4156 0.4002
## pam Connectivity 40.4341 23.7587 31.8710 32.8016 36.6397 42.2599 49.5706 64.0647 64.4107
## Dunn 0.0383 0.0683 0.0551 0.1005 0.0554 0.0554 0.0741 0.0660 0.0450
## Silhouette 0.3161 0.3588 0.4004 0.3667 0.4253 0.4137 0.3798 0.3735 0.3866
##
## Optimal Scores:
##
## Score Method Clusters
## Connectivity 11.5218 hierarchical 2
## Dunn 0.1538 hierarchical 9
## Silhouette 0.4298 kmeans 7
# Create the list of clustering methods: methods
methods <- c("hierarchical", "kmeans", "pam")
# Compare clustering methods: results
results <- clValid(mall_scaled, 2:10, clMethods = methods, validation = "stability")
## Warning in clValid(mall_scaled, 2:10, clMethods = methods, validation =
## "stability"): rownames for data not specified, using 1:nrow(data)
# Summarize the results
summary(results)
##
## Clustering Methods:
## hierarchical kmeans pam
##
## Cluster sizes:
## 2 3 4 5 6 7 8 9 10
##
## Validation Measures:
## 2 3 4 5 6 7 8 9 10
##
## hierarchical APN 0.1064 0.1838 0.2506 0.2997 0.3282 0.3631 0.3804 0.3376 0.3452
## AD 2.1956 1.8880 1.7034 1.6045 1.5477 1.4783 1.4540 1.3322 1.3060
## ADM 1.0162 0.8303 0.8859 0.9028 0.9009 0.9023 0.8922 0.8005 0.7830
## FOM 0.9942 0.9878 0.9549 0.9468 0.9361 0.9313 0.9324 0.8960 0.8934
## kmeans APN 0.2547 0.2913 0.3127 0.3659 0.3464 0.3407 0.3480 0.3740 0.3924
## AD 2.0424 1.8111 1.6195 1.5768 1.4826 1.4025 1.3432 1.3095 1.2830
## ADM 0.7833 0.8359 0.8218 0.9040 0.9127 0.8658 0.8289 0.8232 0.8005
## FOM 0.9959 0.9791 0.9484 0.9427 0.9327 0.9240 0.9002 0.8893 0.8939
## pam APN 0.2095 0.3028 0.2872 0.3451 0.3816 0.3705 0.3732 0.4302 0.4512
## AD 1.9793 1.8171 1.5905 1.5410 1.4516 1.3800 1.3369 1.3232 1.2558
## ADM 0.6366 0.8874 0.7664 0.8463 0.8339 0.7942 0.7569 0.7856 0.7631
## FOM 0.9779 0.9789 0.9350 0.9386 0.9330 0.9371 0.9264 0.9328 0.8989
##
## Optimal Scores:
##
## Score Method Clusters
## APN 0.1064 hierarchical 2
## AD 1.2558 pam 10
## ADM 0.6366 pam 2
## FOM 0.8893 kmeans 9
# Plot 3D mall_scaled data
plot3D::scatter3D(x = mall_scaled[, 1], y = mall_scaled[, 2], z = mall_scaled[, 3], col = "blue")
# Get K-means centroids for K = 7 and add them to the plot
km_centers <- results@clusterObjs$kmeans$`7`$centers
plot3D::points3D(km_centers[, 1], km_centers[, 2], km_centers[, 3], col = "red", pch=20, add=TRUE, cex=2.5)
# Get PAM's medoids for K = 7 and add them to the plot
pam_idxs <- results@clusterObjs$pam$'7'$medoids
pam_med <- mall_scaled[pam_idxs, ]
plot3D::points3D(pam_med[, 1], pam_med[, 2], pam_med[, 3], col = "green", pch=20, add=TRUE, cex=2.5)
appsOld <- apps
apps <- appsOld %>%
select(Rating, Reviews, Installs, Type, Price, `Content Rating`) %>%
rename(Content=`Content Rating`) %>%
mutate(HasPositiveReviews=TRUE, Price=as.numeric(gsub('\\$', '', Price))) %>%
filter(complete.cases(.))
# Glimpse at the data
glimpse(apps)
## Observations: 9,366
## Variables: 7
## $ Rating <dbl> 4.1, 3.9, 4.7, 4.5, 4.3, 4.4, 3.8, 4.1, 4.4, 4.7...
## $ Reviews <dbl> 159, 967, 87510, 215644, 967, 167, 178, 36815, 1...
## $ Installs <chr> "10,000+", "500,000+", "5,000,000+", "50,000,000...
## $ Type <chr> "Free", "Free", "Free", "Free", "Free", "Free", ...
## $ Price <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ Content <chr> "Everyone", "Everyone", "Everyone", "Teen", "Eve...
## $ HasPositiveReviews <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, ...
# Identify near-zero-variance predictors: nzv
nzv <- caret::nearZeroVar(apps, names=TRUE)
print(nzv)
## [1] "Price" "HasPositiveReviews"
# Frequency of the HasPositiveReviews attribute
table(apps$HasPositiveReviews)
##
## TRUE
## 9366
# Frequency of the Price attribute
table(apps$Price)
##
## 0 0.99 1 1.2 1.29 1.49 1.5 1.59 1.61 1.7 1.75
## 8719 107 2 1 1 31 1 1 1 2 1
## 1.76 1.97 1.99 2 2.49 2.5 2.56 2.59 2.9 2.95 2.99
## 1 1 59 1 21 1 1 1 1 1 114
## 3.02 3.04 3.08 3.28 3.49 3.88 3.9 3.95 3.99 4.29 4.49
## 1 1 1 1 7 1 1 2 58 1 9
## 4.59 4.6 4.77 4.84 4.99 5.49 5.99 6.49 6.99 7.49 7.99
## 1 1 1 1 70 3 18 1 13 2 7
## 8.49 8.99 9 9.99 10 10.99 11.99 12.99 13.99 14 14.99
## 1 4 2 16 3 2 5 5 1 1 10
## 15.46 15.99 16.99 17.99 18.99 19.4 19.99 24.99 29.99 33.99 37.99
## 1 1 3 2 1 1 5 5 6 2 1
## 39.99 79.99 299.99 379.99 389.99 399.99 400
## 1 2 1 1 1 11 1
# Remove these features: apps_clean
apps_clean <- apps %>%
select(-HasPositiveReviews, -Price)
# Glimpse at the fifa data
# glimpse(fifa)
# Are there zero or near-zero variance features?
# nearZeroVar(fifa)
# Highly correlated predictors: cor_90plus
# (cor_90plus <- findCorrelation(cor(fifa), names = TRUE))
# Highly correlated predictors (>= 98%): cor_98plus
# (cor_98plus <- findCorrelation(cor(fifa), names = TRUE, cutoff = 0.98))
# Remove cor_90plus features: fifa_clean
# fifa_clean <- fifa %>%
# select(-cor_90plus)
# Train model on original scaled data: mdl_orig
# mdl_orig <- train(Club ~ ., data = team_train, method="svmLinear2", trControl = trainCtrl)
# Predict on original test data: orig_preds, orig_probs
# orig_preds <- predict(mdl_orig, team_test)
# orig_probs <- predict(mdl_orig, team_test, type="prob")
# Compute and print the confusion matrix: cm_orig
# (cm_orig <- confusionMatrix(orig_preds, team_test$Club))
# Compute and print AUC: auc_orig
# (auc_orig <- auc(team_test$Club == 'Real.Madrid', orig_probs$'Real.Madrid'))
# Transform training and test data: train_pca, test_pca
# pca <- preProcess(x = team_train[, -match("Club", names(team_train))], method = "pca")
# train_pca <- predict(pca, team_train)
# test_pca <- predict(pca, team_test)
# Train model on PCA data: mdl_pca
# mdl_pca <- train(Club ~ ., data = train_pca, method = "svmLinear2", trControl = trainCtrl)
# Predict on PCA data: pca_preds, pca_probs
# pca_preds <- predict(mdl_pca, test_pca)
# pca_probs <- predict(mdl_pca, test_pca, type = "prob")
# Compute and print confusion matrix & AUC: cm_pca, auc_pca
# (cm_pca <- confusionMatrix(pca_preds, test_pca$Club))
# (auc_pca <- auc(test_pca$Club == 'Real.Madrid', pca_probs$'Real.Madrid'))
# Transform training and test data: train_lda, test_lda
# my_lda <- lda(Club ~ ., data = team_train)
# train_lda <- as.data.frame(predict(my_lda, team_train))
# test_lda <- as.data.frame(predict(my_lda, team_test))
# Train model on LDA-preprocessed data: mdl_lda
# mdl_lda <- train(class ~ ., data = train_lda, method="svmLinear2", trControl = trainCtrl)
# Predict on LDA-ed test data: lda_preds, lda_probs
# lda_preds <- predict(mdl_lda, test_lda)
# lda_probs <- predict(mdl_lda, test_lda, type="prob")
# Compute and print confusion matrix & AUC: cm_lda, auc_lda
# (cm_lda <- confusionMatrix(lda_preds, test_lda$class))
# (auc_lda <- auc(test_lda$class == 'Real.Madrid', lda_probs$Real.Madrid))
Chapter 4 - Model Evaluation
Model Evaluation:
Handling Imbalanced Data:
Hyperparameter Tuning:
Random Forests or Gradient Boosted Trees:
Wrap Up:
Example code includes:
apps <- appsOld %>%
select(Category, Rating, Reviews, Size, Installs, `Content Rating`) %>%
rename(Content.Rating=`Content Rating`) %>%
filter(complete.cases(.), Category %in% c("EDUCATION", "ENTERTAINMENT"),
Size!="Varies with device"
) %>%
mutate(Category=factor(Category), Installs=factor(Installs), Content.Rating=factor(Content.Rating))
appSize <- rep(NA, nrow(apps))
mbSize <- grep("^[0-9][0-9\\.]*M", apps$Size)
kbSize <- grep("^[0-9][0-9\\.]*k", apps$Size)
appSize[mbSize] <- as.numeric(gsub('M', '', apps$Size[mbSize]))
appSize[kbSize] <- as.numeric(gsub('k', '', apps$Size[kbSize])) / 1000
apps$Size <- appSize
glimpse(apps)
## Observations: 200
## Variables: 6
## $ Category <fct> EDUCATION, EDUCATION, EDUCATION, EDUCATION, EDUCATIO...
## $ Rating <dbl> 4.6, 4.7, 4.6, 4.7, 4.5, 4.7, 4.8, 4.6, 4.6, 4.6, 4....
## $ Reviews <dbl> 181893, 2544, 85375, 314299, 9770, 32346, 4075, 1061...
## $ Size <dbl> 18.0, 18.0, 21.0, 3.3, 39.0, 3.2, 5.1, 11.0, 27.0, 3...
## $ Installs <fct> "10,000,000+", "100,000+", "5,000,000+", "10,000,000...
## $ Content.Rating <fct> Everyone 10+, Everyone, Everyone, Everyone, Everyone...
set.seed(1912261548)
trIndex <- sort(sample(1:nrow(apps), round(0.75*nrow(apps)), replace=FALSE))
training <- apps[trIndex, ]
testing <- apps[-trIndex, ]
cv10 <- caret::trainControl(method="cv", number=10, classProbs=TRUE,
summaryFunction=caret::twoClassSummary
)
# Create KNN model: mdlKNN
set.seed(123)
mdlKNN <- train(Category ~ ., data = training, method = "knn", trControl = cv10, metric="ROC")
# Print the KNN model and its confusion matrix
print(mdlKNN)
## k-Nearest Neighbors
##
## 150 samples
## 5 predictor
## 2 classes: 'EDUCATION', 'ENTERTAINMENT'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 134, 135, 135, 135, 135, 135, ...
## Resampling results across tuning parameters:
##
## k ROC Sens Spec
## 5 0.6361111 0.7277778 0.5119048
## 7 0.6932044 0.7513889 0.6095238
## 9 0.6851190 0.7388889 0.5666667
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 7.
ModelMetrics::confusionMatrix(predict(mdlKNN, testing), testing$Category)
## [,1] [,2]
## [1,] 0 0
## [2,] 0 28
# Predict class labels and probs: knn_preds, knn_probs
knn_preds <- predict(mdlKNN, newdata = testing)
knn_probs <- predict(mdlKNN, newdata = testing, type="prob")
# Print accuracy and AUC values
print(Metrics::accuracy(testing$Category, knn_preds))
## [1] 0.58
print(Metrics::auc(testing$Category == 'ENTERTAINMENT', knn_probs[, 2]))
## [1] 0.6067323
# Train SVM: mdlSVM
# set.seed(123)
# mdlSVM <- train(Overall ~ ., data = training, method = "svmRadial", trControl = cv10)
# Print the SVM model
# print(mdlSVM)
# Predict overall score on testing data: svm_preds
# svm_preds <- predict(mdlSVM, newdata = testing)
# Print RMSE and MAE values
# print(rmse(testing$Overall, svm_preds))
# print(mae(testing$Overall, svm_preds))
# Glimpse at the data
glimpse(mall_scaled)
## num [1:200, 1:3] -1.421 -1.278 -1.349 -1.135 -0.562 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:3] "Age" "AnnualIncome" "SpendingScore"
## - attr(*, "scaled:center")= Named num [1:3] 38.9 60560 50.2
## ..- attr(*, "names")= chr [1:3] "Age" "AnnualIncome" "SpendingScore"
## - attr(*, "scaled:scale")= Named num [1:3] 14 26264.7 25.8
## ..- attr(*, "names")= chr [1:3] "Age" "AnnualIncome" "SpendingScore"
# Run DIANA: results
results <- clValid::clValid(mall_scaled, 2:10, clMethods = "diana", validation = "internal")
## Warning in clValid::clValid(mall_scaled, 2:10, clMethods = "diana", validation =
## "internal"): rownames for data not specified, using 1:nrow(data)
# Print and summarize results
print(results)
##
## Call:
## clValid::clValid(obj = mall_scaled, nClust = 2:10, clMethods = "diana",
## validation = "internal")
##
## Clustering Methods:
## diana
##
## Cluster sizes:
## 2 3 4 5 6 7 8 9 10
##
## Validation measures:
## Connectivity Dunn Silhouette
summary(results)
##
## Clustering Methods:
## diana
##
## Cluster sizes:
## 2 3 4 5 6 7 8 9 10
##
## Validation Measures:
## 2 3 4 5 6 7 8 9 10
##
## diana Connectivity 15.2317 22.5933 24.7599 29.3944 35.5631 42.6484 54.9845 62.9528 73.1131
## Dunn 0.0596 0.0275 0.0320 0.0401 0.0408 0.0434 0.0467 0.0483 0.0563
## Silhouette 0.3355 0.3524 0.3971 0.4161 0.4258 0.4188 0.3992 0.4012 0.3869
##
## Optimal Scores:
##
## Score Method Clusters
## Connectivity 15.2317 diana 2
## Dunn 0.0596 diana 2
## Silhouette 0.4258 diana 6
# Plot results
plot(results)
# Glimpse at the data
# glimpse(pulsar)
# Is there a class imbalance?
# table(pulsar$target_class)
# Set seed and partition data
# set.seed(123)
# inTrain <- createDataPartition(y = pulsar$target_class, p = .75, list = FALSE)
# training <- pulsar[inTrain,]
# testing <- pulsar[-inTrain,]
# Is there class imbalance in the training and test sets?
# table(training$target_class)
# table(testing$target_class)
trainDTree <- function(train_data, samplingMode = NULL) {
set.seed(123)
ctrl <- trainControl(method = "cv", number = 10, classProbs = TRUE,
summaryFunction = twoClassSummary, sampling = samplingMode
)
train(target_class ~ ., data = train_data, method = "rpart", metric = "ROC", trControl = ctrl)
}
# Train and print model with no subsampling: mdl_orig
# (mdl_orig <- trainDTree(training))
# Train model with downsampling: mdl_down
# (mdl_down <- trainDTree(training, samplingMode = "down"))
# Train model with upsampling: mdl_up
# (mdl_up <- trainDTree(training, samplingMode = "up"))
# Train model with SMOTE: mdl_smote
# (mdl_smote <- trainDTree(training, samplingMode = "smote"))
get_auc <- function(model, data) {
library(Metrics)
preds <- predict(model, data, type = "prob")[, "yes"]
auc(data$target_class == "yes", preds)
}
# Create model list: mdl_list
# mdl_list <- list(orig = mdl_orig, down = mdl_down, up = mdl_up, smote = mdl_smote)
# Compute AUC on training subsamples: resampling
# resampling <- resamples(mdl_list)
# summary(resampling, metric="ROC")
# Compute AUC on test data: auc_values
# auc_values <- sapply(mdl_list, FUN=get_auc, data = testing)
# print(auc_values)
set.seed(1912261602)
carIdx <- sort(sample(1:nrow(car), round(0.75*nrow(car)), replace=FALSE))
car_train <- car[carIdx, ]
car_test <- car[-carIdx, ]
# Set up train control: trc
trc <- caret::trainControl(method = "repeatedcv", number = 3, repeats = 5)
# Train model: svmr
svmr <- caret::train(consume ~ ., data = car_train, method = "svmRadial", trControl = trc)
# Print and plot SVM model
print(svmr)
## Support Vector Machines with Radial Basis Function Kernel
##
## 291 samples
## 5 predictor
##
## No pre-processing
## Resampling: Cross-Validated (3 fold, repeated 5 times)
## Summary of sample sizes: 194, 193, 195, 193, 194, 195, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 16.60917 0.07026194 9.817553
## 0.50 16.56663 0.07598980 9.836894
## 1.00 16.69086 0.07013250 9.949751
##
## Tuning parameter 'sigma' was held constant at a value of 0.333598
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.333598 and C = 0.5.
plot(svmr)
# Set up train control: trc
trc <- caret::trainControl(method = "cv", number = 10)
# Create custom hyperparameter grid: hp_grid
hp_grid <- expand.grid(C = seq(from=0.2, to=1.0, by=0.2), sigma = c(0.35, 0.6, 0.75))
# Train model: svmr
svmr <- caret::train(consume ~ ., data = car_train, method = "svmRadial", trControl = trc, tuneGrid = hp_grid)
# Print and plot SVM model
print(svmr)
## Support Vector Machines with Radial Basis Function Kernel
##
## 291 samples
## 5 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 261, 261, 263, 262, 262, 263, ...
## Resampling results across tuning parameters:
##
## C sigma RMSE Rsquared MAE
## 0.2 0.35 16.26445 0.1133446 9.805302
## 0.2 0.60 16.29272 0.1095463 9.803260
## 0.2 0.75 16.27141 0.1114307 9.776178
## 0.4 0.35 16.28267 0.1171166 9.834590
## 0.4 0.60 16.26987 0.1189881 9.818069
## 0.4 0.75 16.25134 0.1215314 9.797508
## 0.6 0.35 16.28707 0.1206414 9.842559
## 0.6 0.60 16.27261 0.1222429 9.864889
## 0.6 0.75 16.29126 0.1183495 9.855399
## 0.8 0.35 16.32585 0.1222056 9.876548
## 0.8 0.60 16.32531 0.1188580 9.907451
## 0.8 0.75 16.36181 0.1113415 9.907735
## 1.0 0.35 16.37773 0.1204052 9.913318
## 1.0 0.60 16.42585 0.1111526 9.966089
## 1.0 0.75 16.44576 0.1042592 9.990756
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.75 and C = 0.4.
plot(svmr)
# Set random seed
set.seed(42)
# Set up train control: trc
trc <- caret::trainControl(method = "cv", number = 10, search = "random")
# Train model: svmr
svmr <- caret::train(consume ~ ., data = car_train, method = "svmRadial", trControl = trc, tuneLength = 10)
# Print and plot SVM model
print(svmr)
## Support Vector Machines with Radial Basis Function Kernel
##
## 291 samples
## 5 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 263, 262, 262, 262, 261, 262, ...
## Resampling results across tuning parameters:
##
## sigma C RMSE Rsquared MAE
## 0.01863605 139.97515122 16.96691 0.09870029 10.298489
## 0.01913389 0.04360606 16.67347 0.08400858 10.220681
## 0.03218316 182.01960806 17.35699 0.08326584 10.708358
## 0.04128325 447.99451171 18.81305 0.08272349 11.582198
## 0.05397889 0.11703741 16.49487 0.09765690 9.931373
## 0.06597003 40.93533689 17.42050 0.07540061 10.627318
## 0.07371349 4.52222667 16.56755 0.11544558 9.879941
## 0.23347673 26.75634732 18.13670 0.09849218 11.209528
## 0.35231334 0.49924112 16.44355 0.10131733 9.707891
## 1.30041593 511.66463964 27.86139 0.03973164 18.933067
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.3523133 and C = 0.4992411.
plot(svmr)
# Train the RF model: mdlRF
mdlRF <- randomForest::randomForest(formula = Rating ~ ., data = training, ntree = 500)
# Print the RF model
print(mdlRF)
##
## Call:
## randomForest(formula = Rating ~ ., data = training, ntree = 500)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 0.05913992
## % Var explained: 26.65
# RF variable importance
randomForest::varImpPlot(mdlRF)
print(mdlRF$importance)
## IncNodePurity
## Category 0.9254966
## Reviews 1.9137065
## Size 1.4592641
## Installs 1.4008747
## Content.Rating 0.9916575
# Train a GBM model with 500 trees: mdlGBM
mdlGBM <- gbm::gbm(formula = Rating ~ ., data = training, n.trees = 500)
## Distribution not specified, assuming gaussian ...
# Print GBM model
print(mdlGBM)
## gbm::gbm(formula = Rating ~ ., data = training, n.trees = 500)
## A gradient boosted model with gaussian loss function.
## 500 iterations were performed.
## There were 5 predictors of which 5 had non-zero influence.
# Summarize GBM's variable importance
summary(mdlGBM)
## var rel.inf
## Installs Installs 36.908130
## Reviews Reviews 35.318460
## Size Size 16.759659
## Content.Rating Content.Rating 5.577588
## Category Category 5.436162
# Predict on the testing data: gbm_preds, rf_preds
gbm_preds <- predict(mdlGBM, n.trees = 500, newdata = testing)
rf_preds <- predict(mdlRF, newdata = testing)
# RMSE metric for both models: gbm_rmse, rf_rmse
(gbm_rmse <- Metrics::rmse(testing$Rating, gbm_preds))
## [1] 0.2979939
(rf_rmse <- Metrics::rmse(testing$Rating, rf_preds))
## [1] 0.2866548
# RRSE metric for both models: gbm_rrse, rf_rrse
(gbm_rrse <- Metrics::rrse(testing$Rating, gbm_preds))
## [1] 0.9146599
(rf_rrse <- Metrics::rrse(testing$Rating, rf_preds))
## [1] 0.8798557
Chapter 1 - True Fundamentals
Regular Expression Basics:
Tokenization:
Text Cleaning Basics:
Example code includes:
text <- c("John's favorite color two colors are blue and red.", "John's favorite number is 1111.", 'John lives at P Sherman, 42 Wallaby Way, Sydney', 'He is 7 feet tall', 'John has visited 30 countries', 'John only has nine fingers.', 'John has worked at eleven different jobs', 'He can speak 3 languages', "john's favorite food is pizza", 'John can name 10 facts about himself.')
# Print off each item that contained a numeric number
grep(pattern = "\\d", x = text, value = TRUE)
## [1] "John's favorite number is 1111."
## [2] "John lives at P Sherman, 42 Wallaby Way, Sydney"
## [3] "He is 7 feet tall"
## [4] "John has visited 30 countries"
## [5] "He can speak 3 languages"
## [6] "John can name 10 facts about himself."
# Find all items with a number followed by a space
grep(pattern = "\\d\\s", x = text)
## [1] 3 4 5 8 10
# How many times did you write down 'favorite'?
length(grep(pattern = "favorite", x = text))
## [1] 3
# Print off the text for every time you used your boss's name, John
grep('John', x = text, value = TRUE)
## [1] "John's favorite color two colors are blue and red."
## [2] "John's favorite number is 1111."
## [3] "John lives at P Sherman, 42 Wallaby Way, Sydney"
## [4] "John has visited 30 countries"
## [5] "John only has nine fingers."
## [6] "John has worked at eleven different jobs"
## [7] "John can name 10 facts about himself."
# Try replacing all occurences of "John" with "He"
gsub(pattern = 'John', replacement = 'He ', x = text)
## [1] "He 's favorite color two colors are blue and red."
## [2] "He 's favorite number is 1111."
## [3] "He lives at P Sherman, 42 Wallaby Way, Sydney"
## [4] "He is 7 feet tall"
## [5] "He has visited 30 countries"
## [6] "He only has nine fingers."
## [7] "He has worked at eleven different jobs"
## [8] "He can speak 3 languages"
## [9] "john's favorite food is pizza"
## [10] "He can name 10 facts about himself."
# Replace all occurences of "John " with 'He '.
clean_text <- gsub(pattern = 'John\\s', replacement = 'He ', x = text)
clean_text
## [1] "John's favorite color two colors are blue and red."
## [2] "John's favorite number is 1111."
## [3] "He lives at P Sherman, 42 Wallaby Way, Sydney"
## [4] "He is 7 feet tall"
## [5] "He has visited 30 countries"
## [6] "He only has nine fingers."
## [7] "He has worked at eleven different jobs"
## [8] "He can speak 3 languages"
## [9] "john's favorite food is pizza"
## [10] "He can name 10 facts about himself."
# Replace all occurences of "John's" with 'His'
gsub(pattern = "John\\'s", replacement = 'His', x = clean_text)
## [1] "His favorite color two colors are blue and red."
## [2] "His favorite number is 1111."
## [3] "He lives at P Sherman, 42 Wallaby Way, Sydney"
## [4] "He is 7 feet tall"
## [5] "He has visited 30 countries"
## [6] "He only has nine fingers."
## [7] "He has worked at eleven different jobs"
## [8] "He can speak 3 languages"
## [9] "john's favorite food is pizza"
## [10] "He can name 10 facts about himself."
animal_farm <- read_csv("./RInputFiles/animal_farm.csv")
## Parsed with column specification:
## cols(
## chapter = col_character(),
## text_column = col_character()
## )
str(animal_farm)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 10 obs. of 2 variables:
## $ chapter : chr "Chapter 1" "Chapter 2" "Chapter 3" "Chapter 4" ...
## $ text_column: chr "Mr. Jones, of the Manor Farm, had locked the hen-houses for the night, but was too drunk to remember to shut th"| __truncated__ "Three nights later old Major died peacefully in his sleep. His body was buried at the foot of the orchard.This "| __truncated__ "How they toiled and sweated to get the hay in! But their efforts were rewarded, for the harvest was an even big"| __truncated__ "By the late summer the news of what had happened on Animal Farm had spread across half the county. Every day Sn"| __truncated__ ...
## - attr(*, "spec")=
## .. cols(
## .. chapter = col_character(),
## .. text_column = col_character()
## .. )
# Split the text_column into sentences
animal_farm %>%
tidytext::unnest_tokens(output = "sentences", input = text_column, token = "sentences") %>%
# Count sentences, per chapter
count(chapter)
## # A tibble: 10 x 2
## chapter n
## <chr> <int>
## 1 Chapter 1 136
## 2 Chapter 10 167
## 3 Chapter 2 140
## 4 Chapter 3 114
## 5 Chapter 4 84
## 6 Chapter 5 158
## 7 Chapter 6 136
## 8 Chapter 7 190
## 9 Chapter 8 203
## 10 Chapter 9 195
# Split the text_column using regular expressions
animal_farm %>%
tidytext::unnest_tokens(output = "sentences", input = text_column, token = "regex", pattern = "\\.") %>%
count(chapter)
## # A tibble: 10 x 2
## chapter n
## <chr> <int>
## 1 Chapter 1 131
## 2 Chapter 10 179
## 3 Chapter 2 150
## 4 Chapter 3 113
## 5 Chapter 4 92
## 6 Chapter 5 158
## 7 Chapter 6 127
## 8 Chapter 7 188
## 9 Chapter 8 200
## 10 Chapter 9 174
# Tokenize animal farm's text_column column
tidy_animal_farm <- animal_farm %>%
tidytext::unnest_tokens(word, text_column)
# Print the word frequencies
tidy_animal_farm %>%
count(word, sort = TRUE)
## # A tibble: 4,076 x 2
## word n
## <chr> <int>
## 1 the 2187
## 2 and 966
## 3 of 899
## 4 to 814
## 5 was 633
## 6 a 620
## 7 in 537
## 8 had 529
## 9 that 451
## 10 it 384
## # ... with 4,066 more rows
# Remove stop words, using stop_words from tidytext
str(tidy_animal_farm)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 30037 obs. of 2 variables:
## $ chapter: chr "Chapter 1" "Chapter 1" "Chapter 1" "Chapter 1" ...
## $ word : chr "mr" "jones" "of" "the" ...
## - attr(*, "spec")=
## .. cols(
## .. chapter = col_character(),
## .. text_column = col_character()
## .. )
tidy_animal_farm <- tidy_animal_farm %>%
anti_join(tidytext::stop_words)
## Joining, by = "word"
str(tidy_animal_farm)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 10579 obs. of 2 variables:
## $ chapter: chr "Chapter 1" "Chapter 1" "Chapter 1" "Chapter 1" ...
## $ word : chr "jones" "manor" "farm" "locked" ...
## - attr(*, "spec")=
## .. cols(
## .. chapter = col_character(),
## .. text_column = col_character()
## .. )
# Perform stemming on tidy_animal_farm
stemmed_animal_farm <- tidy_animal_farm %>%
mutate(word = SnowballC::wordStem(word))
# Print the old word frequencies
tidy_animal_farm %>%
count(word, sort = TRUE)
## # A tibble: 3,611 x 2
## word n
## <chr> <int>
## 1 animals 248
## 2 farm 163
## 3 napoleon 141
## 4 animal 107
## 5 snowball 106
## 6 pigs 91
## 7 boxer 76
## 8 time 71
## 9 windmill 68
## 10 squealer 61
## # ... with 3,601 more rows
# Print the new word frequencies
stemmed_animal_farm %>%
count(word, sort = TRUE)
## # A tibble: 2,751 x 2
## word n
## <chr> <int>
## 1 anim 363
## 2 farm 173
## 3 napoleon 141
## 4 pig 114
## 5 snowbal 106
## 6 comrad 94
## 7 dai 86
## 8 time 83
## 9 boxer 76
## 10 windmil 70
## # ... with 2,741 more rows
Chapter 2 - Representations of Text
Understanding an R Corpus:
Bag-of-words Representation:
TFIDF - Term Frequency Inverse Document Frequency:
Cosine Similarity:
Example code includes:
crudeText <- c('Diamond Shamrock Corp said that\neffective today it had cut its contract prices for crude oil by\n1.50 dlrs a barrel.\n The reduction brings its posted price for West Texas\nIntermediate to 16.00 dlrs a barrel, the copany said.\n \"The price reduction today was made in the light of falling\noil product prices and a weak crude oil market,\" a company\nspokeswoman said.\n Diamond is the latest in a line of U.S. oil companies that\nhave cut its contract, or posted, prices over the last two days\nciting weak oil markets.\n Reuter')
crudeText <- c(crudeText, 'OPEC may be forced to meet before a\nscheduled June session to readdress its production cutting\nagreement if the organization wants to halt the current slide\nin oil prices, oil industry analysts said.\n \"The movement to higher oil prices was never to be as easy\nas OPEC thought. They may need an emergency meeting to sort out\nthe problems,\" said Daniel Yergin, director of Cambridge Energy\nResearch Associates, CERA.\n Analysts and oil industry sources said the problem OPEC\nfaces is excess oil supply in world oil markets.\n \"OPECs problem is not a price problem but a production\nissue and must be addressed in that way,\" said Paul Mlotok, oil\nanalyst with Salomon Brothers Inc.\n He said the markets earlier optimism about OPEC and its\nability to keep production under control have given way to a\npessimistic outlook that the organization must address soon if\nit wishes to regain the initiative in oil prices.\n But some other analysts were uncertain that even an\nemergency meeting would address the problem of OPEC production\nabove the 15.8 mln bpd quota set last December.\n \"OPEC has to learn that in a buyers market you cannot have\ndeemed quotas, fixed prices and set differentials,\" said the\nregional manager for one of the major oil companies who spoke\non condition that he not be named. \"The market is now trying to\nteach them that lesson again,\" he added.\n David T. Mizrahi, editor of Mideast reports, expects OPEC\nto meet before June, although not immediately. However, he is\nnot optimistic that OPEC can address its principal problems.\n \"They will not meet now as they try to take advantage of the\nwinter demand to sell their oil, but in late March and April\nwhen demand slackens,\" Mizrahi said.\n But Mizrahi said that OPEC is unlikely to do anything more\nthan reiterate its agreement to keep output at 15.8 mln bpd.\"\n Analysts said that the next two months will be critical for\nOPECs ability to hold together prices and output.\n \"OPEC must hold to its pact for the next six to eight weeks\nsince buyers will come back into the market then,\" said Dillard\nSpriggs of Petroleum Analysis Ltd in New York.\n But Bijan Moussavar-Rahmani of Harvard Universitys Energy\nand Environment Policy Center said that the demand for OPEC oil\nhas been rising through the first quarter and this may have\nprompted excesses in its production.\n \"Demand for their (OPEC) oil is clearly above 15.8 mln bpd\nand is probably closer to 17 mln bpd or higher now so what we\nare seeing characterized as cheating is OPEC meeting this\ndemand through current production,\" he told Reuters in a\ntelephone interview.\n Reuter')
crudeText <- c(crudeText, 'Texaco Canada said it lowered the\ncontract price it will pay for crude oil 64 Canadian cts a\nbarrel, effective today.\n The decrease brings the companys posted price for the\nbenchmark grade, Edmonton/Swann Hills Light Sweet, to 22.26\nCanadian dlrs a bbl.\n Texaco Canada last changed its crude oil postings on Feb\n19.\n Reuter')
crudeText <- c(crudeText, 'Marathon Petroleum Co said it reduced\nthe contract price it will pay for all grades of crude oil one\ndlr a barrel, effective today.\n The decrease brings Marathons posted price for both West\nTexas Intermediate and West Texas Sour to 16.50 dlrs a bbl. The\nSouth Louisiana Sweet grade of crude was reduced to 16.85 dlrs\na bbl.\n The company last changed its crude postings on Jan 12.\n Reuter')
crudeText <- c(crudeText, 'Houston Oil Trust said that independent\npetroleum engineers completed an annual study that estimates\nthe trusts future net revenues from total proved reserves at\n88 mln dlrs and its discounted present value of the reserves at\n64 mln dlrs.\n Based on the estimate, the trust said there may be no money\navailable for cash distributions to unitholders for the\nremainder of the year.\n It said the estimates reflect a decrease of about 44 pct in\nnet reserve revenues and 39 pct in discounted present value\ncompared with the study made in 1985.\n Reuter')
crudeText <- c(crudeText, 'Kuwait\"s Oil Minister, in remarks\npublished today, said there were no plans for an emergency OPEC\nmeeting to review oil policies after recent weakness in world\noil prices.\n Sheikh Ali al-Khalifa al-Sabah was quoted by the local\ndaily al-Qabas as saying: \"None of the OPEC members has asked\nfor such a meeting.\"\n He denied Kuwait was pumping above its quota of 948,000\nbarrels of crude daily (bpd) set under self-imposed production\nlimits of the 13-nation organisation.\n Traders and analysts in international oil markets estimate\nOPEC is producing up to one mln bpd above a ceiling of 15.8 mln\nbpd agreed in Geneva last December.\n They named Kuwait and the United Arab Emirates, along with\nthe much smaller producer Ecuador, among those producing above\nquota. Kuwait, they said, was pumping 1.2 mln bpd.\n \"This rumour is baseless. It is based on reports which said\nKuwait has the ability to exceed its share. They suppose that\nbecause Kuwait has the ability, it will do so,\" the minister\nsaid.\n Sheikh Ali has said before that Kuwait had the ability to\nproduce up to 4.0 mln bpd.\n \"If we can sell more than our quota at official prices,\nwhile some countries are suffering difficulties marketing their\nshare, it means we in Kuwait are unusually clever,\" he said.\n He was referring apparently to the Gulf state of qatar,\nwhich industry sources said was selling less than 180,000 bpd\nof its 285,000 bpd quota, because buyers were resisting\nofficial prices restored by OPEC last month pegged to a marker\nof 18 dlrs per barrel.\n Prices in New York last week dropped to their lowest levels\nthis year and almost three dollars below a three-month high of\n19 dollars a barrel.\n Sheikh Ali also delivered \"a challenge to any international\noil company that declared Kuwait sold below official prices.\"\n Because it was charging its official price, of 16.67 dlrs a\nbarrel, it had lost custom, he said but did not elaborate.\n However, Kuwait had guaranteed markets for its oil because\nof its local and international refining facilities and its own\ndistribution network abroad, he added.\n He reaffirmed that the planned meeting March 7 of OPEC\"s\ndifferentials committee has been postponed until the start of\nApril at the request of certain of the body\"s members.\n Ecuador\"s deputy energy minister Fernando Santos Alvite said\nlast Wednesday his debt-burdened country wanted OPEC to assign\na lower official price for its crude, and was to seek this at\ntalks this month of opec\"s pricing committee.\n Referring to pressure by oil companies on OPEC members, in\napparent reference to difficulties faced by Qatar, he said: \"We\nexpected such pressure. It will continue through March and\nApril.\" But he expected the situation would later improve.\n REUTER')
crudeText <- c(crudeText, 'Indonesia appears to be nearing a\npolitical crossroads over measures to deregulate its protected\neconomy, the U.S. Embassy says in a new report.\n To counter falling oil revenues, the government has\nlaunched a series of measures over the past nine months to\nboost exports outside the oil sector and attract new\ninvestment.\n Indonesia, the only Asian member of OPEC and a leading\nprimary commodity producer, has been severely hit by last year\"s\nfall in world oil prices, which forced it to devalue its\ncurrency by 31 pct in September.\n But the U.S. Embassy report says President Suharto\"s\ngovernment appears to be divided over what direction to lead\nthe economy.\n \"(It) appears to be nearing a crossroads with regard to\nderegulation, both as it pertains to investments and imports,\"\nthe report says. It primarily assesses Indonesia\"s agricultural\nsector, but also reviews the country\"s general economic\nperformance.\n It says that while many government officials and advisers\nare recommending further relaxation, \"there are equally strong\npressures being exerted to halt all such moves.\"\n \"This group strongly favours an import substitution economy,\"\nthe report says.\n Indonesia\"s economic changes have been welcomed by the World\nBank and international bankers as steps in the right direction,\nthough they say crucial areas of the economy like plastics and\nsteel remain highly protected, and virtual monopolies.\n Three sets of measures have been announced since last May,\nwhich broadened areas for foreign investment, reduced trade\nrestrictions and liberalised imports.\n The report says Indonesia\"s economic growth in calendar 1986\nwas probably about zero, and the economy may even have\ncontracted a bit. \"This is the lowest rate of growth since the\nmid-1960s,\" the report notes.\n Indonesia, the largest country in South-East Asia with a\npopulation of 168 million, is facing general elections in\nApril.\n But the report hold out little hope for swift improvement\nin the economic outlook. \"For 1987 early indications point to a\nslightly positive growth rate not exceeding one pct. Economic\nactivity continues to suffer due to the sharp fall in export\nearnings from the petroleum industry.\"\n \"Growth in the non-oil sector is low because of weak\ndomestic demand coupled with excessive plant capacity, real\ndeclines in construction and trade, and a reduced level of\ngrowth in agriculture,\" the report states.\n Bankers say continuation of present economic reforms is\ncrucial for the government to get the international lending its\nneeds.\n A new World Bank loan of 300 mln dlrs last month in balance\nof payments support was given partly to help the government\nmaintain the momentum of reform, the Bank said.\n REUTER')
crudeText <- c(crudeText, 'Saudi riyal interbank deposits were\nsteady at yesterdays higher levels in a quiet market.\n Traders said they were reluctant to take out new positions\namidst uncertainty over whether OPEC will succeed in halting\nthe current decline in oil prices.\n Oil industry sources said yesterday several Gulf Arab\nproducers had had difficulty selling oil at official OPEC\nprices but Kuwait has said there are no plans for an emergency\nmeeting of the 13-member organisation.\n A traditional Sunday lull in trading due to the European\nweekend also contributed to the lack of market activity.\n Spot-next and one-week rates were put at 6-1/4, 5-3/4 pct\nafter quotes ranging between seven, six yesterday.\n One, three, and six-month deposits were quoted unchanged at\n6-5/8, 3/8, 7-1/8, 6-7/8 and 7-3/8, 1/8 pct respectively.\n The spot riyal was quietly firmer at 3.7495/98 to the\ndollar after quotes of 3.7500/03 yesterday.\n REUTER')
crudeText <- c(crudeText, 'The Gulf oil state of Qatar, recovering\nslightly from last years decline in world oil prices,\nannounced its first budget since early 1985 and projected a\ndeficit of 5.472 billion riyals.\n The deficit compared with a shortfall of 7.3 billion riyals\nin the last published budget for 1985/86.\n In a statement outlining the budget for the fiscal year\n1987/88 beginning today, Finance and Petroleum Minister Sheikh\nAbdul-Aziz bin Khalifa al-Thani said the government expected to\nspend 12.217 billion riyals in the period.\n Projected expenditure in the 1985/86 budget had been 15.6\nbillion riyals.\n Sheikh Abdul-Aziz said government revenue would be about\n6.745 billion riyals, down by about 30 pct on the 1985/86\nprojected revenue of 9.7 billion.\n The government failed to publish a 1986/87 budget due to\nuncertainty surrounding oil revenues.\n Sheikh Abdul-Aziz said that during that year the government\ndecided to limit recurrent expenditure each month to\none-twelfth of the previous fiscal years allocations minus 15\npct.\n He urged heads of government departments and public\ninstitutions to help the government rationalise expenditure. He\ndid not say how the 1987/88 budget shortfall would be covered.\n Sheikh Abdul-Aziz said plans to limit expenditure in\n1986/87 had been taken in order to relieve the burden placed on\nthe countrys foreign reserves.\n He added in 1987/88 some 2.766 billion riyals had been\nallocated for major projects including housing and public\nbuildings, social services, health, education, transport and\ncommunications, electricity and water, industry and\nagriculture.\n No figure was revealed for expenditure on defence and\nsecurity. There was also no projection for oil revenue.\n Qatar, an OPEC member, has an output ceiling of 285,000\nbarrels per day.\n Sheikh Abdul-Aziz said: \"Our expectations of positive signs\nregarding (oil) price trends, foremost among them OPECs\ndetermination to shoulder its responsibilites and protect its\nwealth, have helped us make reasonable estimates for the coming\nyears revenue on the basis of our assigned quota.\"\n REUTER')
crudeText <- c(crudeText, 'Saudi Arabian Oil Minister Hisham Nazer\nreiterated the kingdoms commitment to last Decembers OPEC\naccord to boost world oil prices and stabilise the market, the\nofficial Saudi Press Agency SPA said.\n Asked by the agency about the recent fall in free market\noil prices, Nazer said Saudi Arabia \"is fully adhering by the\n... Accord and it will never sell its oil at prices below the\npronounced prices under any circumstance.\"\n Nazer, quoted by SPA, said recent pressure on free market\nprices \"may be because of the end of the (northern hemisphere)\nwinter season and the glut in the market.\"\n Saudi Arabia was a main architect of the December accord,\nunder which OPEC agreed to lower its total output ceiling by\n7.25 pct to 15.8 mln barrels per day (bpd) and return to fixed\nprices of around 18 dlrs a barrel.\n The agreement followed a year of turmoil on oil markets,\nwhich saw prices slump briefly to under 10 dlrs a barrel in\nmid-1986 from about 30 dlrs in late 1985. Free market prices\nare currently just over 16 dlrs.\n Nazer was quoted by the SPA as saying Saudi Arabias\nadherence to the accord was shown clearly in the oil market.\n He said contacts among members of OPEC showed they all\nwanted to stick to the accord.\n In Jamaica, OPEC President Rilwanu Lukman, who is also\nNigerian Oil Minister, said the group planned to stick with the\npricing agreement.\n \"We are aware of the negative forces trying to manipulate\nthe operations of the market, but we are satisfied that the\nfundamentals exist for stable market conditions,\" he said.\n Kuwaits Oil Minister, Sheikh Ali al-Khalifa al-Sabah, said\nin remarks published in the emirates daily Al-Qabas there were\nno plans for an emergency OPEC meeting to review prices.\n Traders and analysts in international oil markets estimate\nOPEC is producing up to one mln bpd above the 15.8 mln ceiling.\n They named Kuwait and the United Arab Emirates, along with\nthe much smaller producer Ecuador, among those producing above\nquota. Sheikh Ali denied that Kuwait was over-producing.\n REUTER')
crudeText <- c(crudeText, 'Saudi crude oil output last month fell\nto an average of 3.5 mln barrels per day (bpd) from 3.8 mln bpd\nin January, Gulf oil sources said.\n They said exports from the Ras Tanurah and Juaymah\nterminals in the Gulf fell to an average 1.9 mln bpd last month\nfrom 2.2 mln in January because of lower liftings by some\ncustomers.\n But the drop was much smaller than expected after Gulf\nexports rallied in the fourth week of February to 2.5 mln bpd\nfrom 1.2 mln in the third week, the sources said.\n The production figures include neutral zone output but not\nsales from floating storage, which are generally considered\npart of a countrys output for Opec purposes.\n Saudi Arabia has an Opec quota of 4.133 mln bpd under a\nproduction restraint scheme approved by the 13-nation group\nlast December to back new official oil prices averaging 18 dlrs\na barrel.\n The sources said the two-fold jump in exports last week\nappeared to be the result of buyers rushing to lift February\nentitlements before the month-end.\n Last weeks high export levels appeared to show continued\nsupport for official Opec prices from Saudi Arabias main crude\ncustomers, the four ex-partners of Aramco, the sources said.\n The four -- Exxon Corp <XON>, Mobil Corp <MOB>, Texaco Inc\n<TX> and Chevron Corp <CHV> -- signed a long-term agreement\nlast month to buy Saudi crude for 17.52 dlrs a barrel.\n However the sources said the real test of Saudi Arabias\nability to sell crude at official prices in a weak market will\ncome this month, when demand for petroleum products\ntraditionally tapers off. Spot prices have fallen in recent\nweeks to more than one dlr below Opec levels.\n Saudi Arabian oil minister Hisham Nazer yesterday\nreiterated the kingdoms commitment to the December OPEC accord\nand said it would never sell below official prices.\n The sources said total Saudi refinery throughput fell\nslightly in February to an average 1.1 mln bpd from 1.2 mln in\nJanuary because of cuts at the Yanbu and Jubail export\nrefineries.\n They put crude oil exports through Yanbu at 100,000 bpd\nlast month, compared to zero in January, while throughput at\nBahrains refinery and neutral zone production remained steady\nat around 200,000 bpd each.\n REUTER')
crudeText <- c(crudeText, 'Deputy oil ministers from six Gulf\nArab states will meet in Bahrain today to discuss coordination\nof crude oil marketing, the official Emirates news agency WAM\nreported.\n WAM said the officials would be discussing implementation\nof last Sundays agreement in Doha by Gulf Cooperation Council\n(GCC) oil ministers to help each other market their crude oil.\n Four of the GCC states - Saudi Arabia, the United Arab\nEmirates (UAE), Kuwait and Qatar - are members of the\nOrganiaation of Petroleum Exporting Countries (OPEC) and some\nface stiff buyer resistance to official OPEC prices.\n Reuter')
crudeText <- c(crudeText, 'Saudi Arabian Oil Minister Hisham Nazer\nreiterated the kingdoms commitment to last Decembers OPEC\naccord to boost world oil prices and stabilize the market, the\nofficial Saudi Press Agency SPA said.\n Asked by the agency about the recent fall in free market\noil prices, Nazer said Saudi Arabia \"is fully adhering by the\n... accord and it will never sell its oil at prices below the\npronounced prices under any circumstance.\"\n Saudi Arabia was a main architect of December pact under\nwhich OPEC agreed to cut its total oil output ceiling by 7.25\npct and return to fixed prices of around 18 dollars a barrel.\n Reuter')
crudeText <- c(crudeText, 'Kuwaits oil minister said in a newspaper\ninterview that there were no plans for an emergency OPEC\nmeeting after the recent weakness in world oil prices.\n Sheikh Ali al-Khalifa al-Sabah was quoted by the local\ndaily al-Qabas as saying that \"none of the OPEC members has\nasked for such a meeting.\"\n He also denied that Kuwait was pumping above its OPEC quota\nof 948,000 barrels of crude daily (bpd).\n Crude oil prices fell sharply last week as international\noil traders and analysts estimated the 13-nation OPEC was\npumping up to one million bpd over its self-imposed limits.\n Reuter')
crudeText <- c(crudeText, 'The port of Philadelphia was closed\nwhen a Cypriot oil tanker, Seapride II, ran aground after\nhitting a 200-foot tower supporting power lines across the\nriver, a Coast Guard spokesman said.\n He said there was no oil spill but the ship is lodged on\nrocks opposite the Hope Creek nuclear power plant in New\nJersey.\n He said the port would be closed until today when they\nhoped to refloat the ship on the high tide.\n After delivering oil to a refinery in Paulsboro, New\nJersey, the ship apparently lost its steering and hit the power\ntransmission line carrying power from the nuclear plant to the\nstate of Delaware.\n Reuter')
crudeText <- c(crudeText, 'A study group said the United States\nshould increase its strategic petroleum reserve to one mln\nbarrels as one way to deal with the present and future impact\nof low oil prices on the domestic oil industry.\n U.S. policy now is to raise the strategic reserve to 750\nmln barrels, from its present 500 mln, to help protect the\neconomy from an overseas embargo or a sharp price rise.\n The Aspen Institute for Humanistic Studies, a private\ngroup, also called for new research for oil exploration and\ndevelopment techniques.\n It predicted prices would remain at about 15-18 dlrs a\nbarrel for several years and then rise to the mid 20s, with\nimports at about 30 pct of U.S. consumption.\n It said instead that such moves as increasing oil reserves\nand more exploration and development research would help to\nguard against or mitigate the risks of increased imports.\n Reuter')
crudeText <- c(crudeText, 'A study group said the United States\nshould increase its strategic petroleum reserve to one mln\nbarrels as one way to deal with the present and future impact\nof low oil prices on the domestic oil industry.\n U.S. policy now is to raise the strategic reserve to 750\nmln barrels, from its present 500 mln, to help protect the\neconomy from an overseas embargo or a sharp price rise.\n The Aspen Institute for Humanistic Studies, a private\ngroup, also called for new research for oil exploration and\ndevelopment techniques.\n It predicted prices would remain at about 15-18 dlrs a\nbarrel for several years and then rise to the mid 20s, with\nimports at about 30 pct of U.S. consumption.\n The study cited two basic policy paths for the nation: to\nprotect the U.S. industry through an import fee or other such\ndevice or to accept the full economic benefits of cheap oil.\n But the group did not strongly back either option, saying\nthere were benefits and drawbacks to both.\n It said instead that such moves as increasing oil reserves\nand more exploration and development research would help to\nguard against or mitigate the risks of increased imports.\n Reuter')
crudeText <- c(crudeText, 'Unocal Corps Union Oil Co said it\nlowered its posted prices for crude oil one to 1.50 dlrs a\nbarrel in the eastern region of the U.S., effective Feb 26.\n Union said a 1.50 dlrs cut brings its posted price for the\nU.S. benchmark grade, West Texas Intermediate, to 16 dlrs.\nLouisiana Sweet also was lowered 1.50 dlrs to 16.35 dlrs, the\ncompany said.\n No changes were made in Unions posted prices for West\nCoast grades of crude oil, the company said.\n Reuter')
crudeText <- c(crudeText, 'The New York Mercantile Exchange set\nApril one for the debut of a new procedure in the energy\ncomplex that will increase the use of energy futures worldwide.\n On April one, NYMEX will allow oil traders that do not\nhold a futures position to initiate, after the exchange closes,\na transaction that can subsequently be hedged in the futures\nmarket, according to an exchange spokeswoman.\n \"This will change the way oil is transacted in the real\nworld,\" said said Thomas McKiernan, McKiernan and Co chairman.\n Foreign traders will be able to hedge trades against NYMEX\nprices before the exchange opens and negotiate prices at a\ndifferential to NYMEX prices, McKiernan explained.\n The expanded program \"will serve the industry because the\noil market does not close when NYMEX does,\" said Frank Capozza,\nsecretary of Century Resources Inc.\n The rule change, which has already taken effect for\nplatinum futures on NYMEX, is expected to increase the open\ninterest and liquidity in U.S. energy futures, according to\ntraders and analysts.\n Currently, at least one trader in this transaction, called\nan exchange for physical or EFP, must hold a futures position\nbefore entering into the transaction.\n Under the new arrangement, neither party has to hold a\nfutures position before entering into an EFP and one or both\nparties can offset their cash transaction with a futures\ncontract the next day, according to exchange officials.\n When NYMEX announced its proposed rule change in December,\nNYMEX President Rosemary McFadden, said, \"Expansion of the EFP\nprovision will add to globalization of the energy markets by\nproviding for, in effect, 24-hour trading.\"\n The Commodity Futures Trading Commission approved the rule\nchange in February, according to a CFTC spokeswoman.\n Reuter')
crudeText <- c(crudeText, 'Argentine crude oil production was\ndown 10.8 pct in January 1987 to 12.32 mln barrels, from 13.81\nmln barrels in January 1986, Yacimientos Petroliferos Fiscales\nsaid.\n January 1987 natural gas output totalled 1.15 billion cubic\nmetrers, 3.6 pct higher than 1.11 billion cubic metres produced\nin January 1986, Yacimientos Petroliferos Fiscales added.\n Reuter')
crude <- tm::VCorpus(tm::VectorSource(crudeText))
NLP::meta(crude, "id") <- c('127', '144', '191', '194', '211', '236', '237', '242', '246', '248', '273', '349', '352', '353', '368', '489', '502', '543', '704', '708')
# Print out the corpus
print(crude)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 1
## Content: documents: 20
# Print the content of the 10th article
crude[[10]]$content
## [1] "Saudi Arabian Oil Minister Hisham Nazer\nreiterated the kingdoms commitment to last Decembers OPEC\naccord to boost world oil prices and stabilise the market, the\nofficial Saudi Press Agency SPA said.\n Asked by the agency about the recent fall in free market\noil prices, Nazer said Saudi Arabia \"is fully adhering by the\n... Accord and it will never sell its oil at prices below the\npronounced prices under any circumstance.\"\n Nazer, quoted by SPA, said recent pressure on free market\nprices \"may be because of the end of the (northern hemisphere)\nwinter season and the glut in the market.\"\n Saudi Arabia was a main architect of the December accord,\nunder which OPEC agreed to lower its total output ceiling by\n7.25 pct to 15.8 mln barrels per day (bpd) and return to fixed\nprices of around 18 dlrs a barrel.\n The agreement followed a year of turmoil on oil markets,\nwhich saw prices slump briefly to under 10 dlrs a barrel in\nmid-1986 from about 30 dlrs in late 1985. Free market prices\nare currently just over 16 dlrs.\n Nazer was quoted by the SPA as saying Saudi Arabias\nadherence to the accord was shown clearly in the oil market.\n He said contacts among members of OPEC showed they all\nwanted to stick to the accord.\n In Jamaica, OPEC President Rilwanu Lukman, who is also\nNigerian Oil Minister, said the group planned to stick with the\npricing agreement.\n \"We are aware of the negative forces trying to manipulate\nthe operations of the market, but we are satisfied that the\nfundamentals exist for stable market conditions,\" he said.\n Kuwaits Oil Minister, Sheikh Ali al-Khalifa al-Sabah, said\nin remarks published in the emirates daily Al-Qabas there were\nno plans for an emergency OPEC meeting to review prices.\n Traders and analysts in international oil markets estimate\nOPEC is producing up to one mln bpd above the 15.8 mln ceiling.\n They named Kuwait and the United Arab Emirates, along with\nthe much smaller producer Ecuador, among those producing above\nquota. Sheikh Ali denied that Kuwait was over-producing.\n REUTER"
# Find the first ID
crude[[1]]$meta$id
## [1] "1"
# Make a vector of IDs
ids <- c()
for(i in c(1:20)){
ids <- append(ids, crude[[i]]$meta$id)
}
# Create a tibble & Review
crude_tibble <- generics::tidy(crude)
names(crude_tibble)
## [1] "author" "datetimestamp" "description" "heading"
## [5] "id" "language" "origin" "text"
crude_counts <- crude_tibble %>%
# Tokenize
tidytext::unnest_tokens(word, text) %>%
# Count by word
count(word, sort = TRUE) %>%
# Remove
anti_join(tidytext::stop_words)
## Joining, by = "word"
# Assign the top word
top_word <- "oil"
russian_tweets <- read_csv("./RInputFiles/russian_1.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_character(),
## X1 = col_double(),
## external_author_id = col_double(),
## following = col_double(),
## followers = col_double(),
## updates = col_double(),
## retweet = col_double(),
## new_june_2018 = col_double(),
## alt_external_id = col_double(),
## tweet_id = col_double(),
## tco3_step1 = col_logical()
## )
## See spec(...) for full column specifications.
## Warning: 29 parsing failures.
## row col expected actual file
## 4526 tco3_step1 1/0/T/F/TRUE/FALSE http://StopMassIncarceration.net './RInputFiles/russian_1.csv'
## 5281 tco3_step1 1/0/T/F/TRUE/FALSE https://www.youtube.com/watch?v=RZS59mXnKSo './RInputFiles/russian_1.csv'
## 5703 tco3_step1 1/0/T/F/TRUE/FALSE https://twitter.com/intent/user?user_id=4352458761 './RInputFiles/russian_1.csv'
## 5763 tco3_step1 1/0/T/F/TRUE/FALSE https://goo.gl/jfulXo './RInputFiles/russian_1.csv'
## 6089 tco3_step1 1/0/T/F/TRUE/FALSE https://youtu.be/gQM8Bql4IpI './RInputFiles/russian_1.csv'
## .... .......... .................. .................................................. .............................
## See problems(...) for more details.
str(russian_tweets)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 20000 obs. of 22 variables:
## $ X1 : num 1 2 3 4 5 6 7 8 9 10 ...
## $ external_author_id: num 9.06e+17 9.06e+17 9.06e+17 9.06e+17 9.06e+17 ...
## $ author : chr "10_GOP" "10_GOP" "10_GOP" "10_GOP" ...
## $ content : chr "\"We have a sitting Democrat US Senator on trial for corruption and you've barely heard a peep from the mainstr"| __truncated__ "Marshawn Lynch arrives to game in anti-Trump shirt. Judging by his sagging pants the shirt should say Lynch vs."| __truncated__ "Daughter of fallen Navy Sailor delivers powerful monologue on anthem protests, burns her NFL packers gear. #Bo"| __truncated__ "JUST IN: President Trump dedicates Presidents Cup golf tournament trophy to the people of Florida, Texas and Pu"| __truncated__ ...
## $ region : chr "Unknown" "Unknown" "Unknown" "Unknown" ...
## $ language : chr "English" "English" "English" "English" ...
## $ publish_date : chr "10/1/2017 19:58" "10/1/2017 22:43" "10/1/2017 22:50" "10/1/2017 23:52" ...
## $ harvested_date : chr "10/1/2017 19:59" "10/1/2017 22:43" "10/1/2017 22:51" "10/1/2017 23:52" ...
## $ following : num 1052 1054 1054 1062 1050 ...
## $ followers : num 9636 9637 9637 9642 9645 ...
## $ updates : num 253 254 255 256 246 247 248 249 250 251 ...
## $ post_type : chr NA NA "RETWEET" NA ...
## $ account_type : chr "Right" "Right" "Right" "Right" ...
## $ retweet : num 0 0 1 0 1 0 1 0 0 0 ...
## $ account_category : chr "RightTroll" "RightTroll" "RightTroll" "RightTroll" ...
## $ new_june_2018 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ alt_external_id : num 9.06e+17 9.06e+17 9.06e+17 9.06e+17 9.06e+17 ...
## $ tweet_id : num 9.15e+17 9.15e+17 9.15e+17 9.15e+17 9.14e+17 ...
## $ article_url : chr "http://twitter.com/905874659358453760/statuses/914580356430536707" "http://twitter.com/905874659358453760/statuses/914621840496189440" "http://twitter.com/905874659358453760/statuses/914623490375979008" "http://twitter.com/905874659358453760/statuses/914639143690555392" ...
## $ tco1_step1 : chr "https://twitter.com/10_gop/status/914580356430536707/video/1" "https://twitter.com/damienwoody/status/914568524449959937/video/1" "https://twitter.com/10_gop/status/913231923715198976/video/1" "https://twitter.com/10_gop/status/914639143690555392/video/1" ...
## $ tco2_step1 : chr NA NA NA NA ...
## $ tco3_step1 : logi NA NA NA NA NA NA ...
## - attr(*, "problems")=Classes 'tbl_df', 'tbl' and 'data.frame': 29 obs. of 5 variables:
## ..$ row : int 4526 5281 5703 5763 6089 6098 6119 6238 6903 7516 ...
## ..$ col : chr "tco3_step1" "tco3_step1" "tco3_step1" "tco3_step1" ...
## ..$ expected: chr "1/0/T/F/TRUE/FALSE" "1/0/T/F/TRUE/FALSE" "1/0/T/F/TRUE/FALSE" "1/0/T/F/TRUE/FALSE" ...
## ..$ actual : chr "http://StopMassIncarceration.net" "https://www.youtube.com/watch?v=RZS59mXnKSo" "https://twitter.com/intent/user?user_id=4352458761" "https://goo.gl/jfulXo" ...
## ..$ file : chr "'./RInputFiles/russian_1.csv'" "'./RInputFiles/russian_1.csv'" "'./RInputFiles/russian_1.csv'" "'./RInputFiles/russian_1.csv'" ...
## - attr(*, "spec")=
## .. cols(
## .. X1 = col_double(),
## .. external_author_id = col_double(),
## .. author = col_character(),
## .. content = col_character(),
## .. region = col_character(),
## .. language = col_character(),
## .. publish_date = col_character(),
## .. harvested_date = col_character(),
## .. following = col_double(),
## .. followers = col_double(),
## .. updates = col_double(),
## .. post_type = col_character(),
## .. account_type = col_character(),
## .. retweet = col_double(),
## .. account_category = col_character(),
## .. new_june_2018 = col_double(),
## .. alt_external_id = col_double(),
## .. tweet_id = col_double(),
## .. article_url = col_character(),
## .. tco1_step1 = col_character(),
## .. tco2_step1 = col_character(),
## .. tco3_step1 = col_logical()
## .. )
# Create a corpus
tweet_corpus <- tm::VCorpus(tm::VectorSource(russian_tweets$content))
# Attach following and followers
NLP::meta(tweet_corpus, 'following') <- russian_tweets$following
NLP::meta(tweet_corpus, 'followers') <- russian_tweets$followers
# Review the meta data
head(NLP::meta(tweet_corpus))
## following followers
## 1 1052 9636
## 2 1054 9637
## 3 1054 9637
## 4 1062 9642
## 5 1050 9645
## 6 1050 9644
# Count occurrence by question and word
words <- crude_tibble %>%
tidytext::unnest_tokens(output = "word", token = "words", input = text) %>%
anti_join(tidytext::stop_words) %>%
count(id, word, sort=TRUE)
## Joining, by = "word"
# How different word/article combinations are there?
unique_combinations <- nrow(words)
# Filter to responses with the word "prices"
words %>%
filter(word == "prices")
## # A tibble: 15 x 3
## id word n
## <chr> <chr> <int>
## 1 10 prices 9
## 2 11 prices 5
## 3 13 prices 5
## 4 2 prices 5
## 5 6 prices 5
## 6 1 prices 3
## 7 19 prices 3
## 8 14 prices 2
## 9 16 prices 2
## 10 17 prices 2
## 11 18 prices 2
## 12 8 prices 2
## 13 12 prices 1
## 14 7 prices 1
## 15 9 prices 1
# How many articles had the word "prices"?
number_of_price_articles <- 15
# Tokenize and remove stop words
tidy_tweets <- russian_tweets %>%
tidytext::unnest_tokens(word, content) %>%
anti_join(tidytext::stop_words)
## Joining, by = "word"
# Count by word
unique_words <- tidy_tweets %>%
count(word)
# Count by tweet (tweet_id) and word
unique_words_by_tweet <- tidy_tweets %>%
count(tweet_id, word)
# Find the size of matrix: rows x columns
size <- nrow(russian_tweets) * length(unique(tidy_tweets$word))
percent <- nrow(unique_words_by_tweet) / size
percent
## [1] 0.0002028352
# Create a tibble with TFIDF values
crude_weights <- crude_tibble %>%
tidytext::unnest_tokens(output = "word", token = "words", input = text) %>%
anti_join(tidytext::stop_words) %>%
count(id, word) %>%
tidytext::bind_tf_idf(word, id, n)
## Joining, by = "word"
# Find the highest TFIDF values
crude_weights %>%
arrange(desc(tf_idf))
## # A tibble: 1,494 x 6
## id word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 20 january 4 0.0930 2.30 0.214
## 2 15 power 4 0.0690 3.00 0.207
## 3 19 futures 9 0.0643 3.00 0.193
## 4 8 8 6 0.0619 3.00 0.185
## 5 3 canada 2 0.0526 3.00 0.158
## 6 3 canadian 2 0.0526 3.00 0.158
## 7 15 ship 3 0.0517 3.00 0.155
## 8 19 nymex 7 0.05 3.00 0.150
## 9 20 cubic 2 0.0465 3.00 0.139
## 10 20 fiscales 2 0.0465 3.00 0.139
## # ... with 1,484 more rows
# Find the lowest non-zero TFIDF values
crude_weights %>%
filter(tf_idf != 0) %>%
arrange(tf_idf)
## # A tibble: 1,454 x 6
## id word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 7 prices 1 0.00452 0.288 0.00130
## 2 9 prices 1 0.00521 0.288 0.00150
## 3 7 dlrs 1 0.00452 0.598 0.00271
## 4 7 opec 1 0.00452 0.693 0.00314
## 5 9 opec 1 0.00521 0.693 0.00361
## 6 7 mln 1 0.00452 0.799 0.00361
## 7 7 petroleum 1 0.00452 0.799 0.00361
## 8 11 petroleum 1 0.00455 0.799 0.00363
## 9 6 barrels 1 0.00429 0.916 0.00393
## 10 6 industry 1 0.00429 0.916 0.00393
## # ... with 1,444 more rows
# Create word counts
animal_farm_counts <- animal_farm %>%
tidytext::unnest_tokens(word, text_column) %>%
count(chapter, word)
# Calculate the cosine similarity
comparisons <- animal_farm_counts %>%
widyr::pairwise_similarity(chapter, word, n) %>%
arrange(desc(similarity))
# Print the mean of the similarity values
comparisons %>%
summarize(mean = mean(similarity)) # very high similarities due to stop words
## # A tibble: 1 x 1
## mean
## <dbl>
## 1 0.949
# Create word counts
animal_farm_counts <- animal_farm %>%
tidytext::unnest_tokens(word, text_column) %>%
anti_join(tidytext::stop_words) %>%
count(chapter, word) %>%
tidytext::bind_tf_idf(chapter, word, n)
## Joining, by = "word"
# Calculate cosine similarity on word counts
animal_farm_counts %>%
widyr::pairwise_similarity(chapter, word, n) %>%
arrange(desc(similarity))
## # A tibble: 90 x 3
## item1 item2 similarity
## <chr> <chr> <dbl>
## 1 Chapter 8 Chapter 7 0.696
## 2 Chapter 7 Chapter 8 0.696
## 3 Chapter 7 Chapter 5 0.693
## 4 Chapter 5 Chapter 7 0.693
## 5 Chapter 8 Chapter 5 0.642
## 6 Chapter 5 Chapter 8 0.642
## 7 Chapter 7 Chapter 6 0.641
## 8 Chapter 6 Chapter 7 0.641
## 9 Chapter 6 Chapter 10 0.638
## 10 Chapter 10 Chapter 6 0.638
## # ... with 80 more rows
# Calculate cosine similarity using tf_idf values
animal_farm_counts %>%
widyr::pairwise_similarity(chapter, word, tf_idf) %>%
arrange(desc(similarity))
## # A tibble: 90 x 3
## item1 item2 similarity
## <chr> <chr> <dbl>
## 1 Chapter 8 Chapter 7 0.0580
## 2 Chapter 7 Chapter 8 0.0580
## 3 Chapter 9 Chapter 8 0.0525
## 4 Chapter 8 Chapter 9 0.0525
## 5 Chapter 7 Chapter 5 0.0467
## 6 Chapter 5 Chapter 7 0.0467
## 7 Chapter 9 Chapter 10 0.0446
## 8 Chapter 10 Chapter 9 0.0446
## 9 Chapter 9 Chapter 7 0.0432
## 10 Chapter 7 Chapter 9 0.0432
## # ... with 80 more rows
Chapter 3 - Applications: Classification and Topic Modeling
Preparing Text for Modeling:
Classification Modeling:
Introduction to Topic Modeling:
LDA in Practice:
Example code includes:
# Stem the tokens
russian_tokens <- russian_tweets %>%
tidytext::unnest_tokens(output = "word", token = "words", input = content) %>%
anti_join(tidytext::stop_words) %>%
mutate(word = SnowballC::wordStem(word))
## Joining, by = "word"
# Create a document term matrix
tweet_matrix <- russian_tokens %>%
count(tweet_id, word) %>%
tidytext::cast_dtm(document = tweet_id, term = word, value = n, weighting = tm::weightTfIdf)
# Print the matrix details
tweet_matrix
## <<DocumentTermMatrix (documents: 19986, terms: 38909)>>
## Non-/sparse entries: 176707/777458567
## Sparsity : 100%
## Maximal term length: 37
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
less_sparse_matrix <- tm::removeSparseTerms(tweet_matrix, sparse = 0.5)
# Print results
tweet_matrix
## <<DocumentTermMatrix (documents: 19986, terms: 38909)>>
## Non-/sparse entries: 176707/777458567
## Sparsity : 100%
## Maximal term length: 37
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
less_sparse_matrix
## <<DocumentTermMatrix (documents: 19986, terms: 2)>>
## Non-/sparse entries: 27527/12445
## Sparsity : 31%
## Maximal term length: 4
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
less_sparse_matrix <- tm::removeSparseTerms(tweet_matrix, sparse = 0.9)
# Print results
tweet_matrix
## <<DocumentTermMatrix (documents: 19986, terms: 38909)>>
## Non-/sparse entries: 176707/777458567
## Sparsity : 100%
## Maximal term length: 37
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
less_sparse_matrix
## <<DocumentTermMatrix (documents: 19986, terms: 2)>>
## Non-/sparse entries: 27527/12445
## Sparsity : 31%
## Maximal term length: 4
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
less_sparse_matrix <- tm::removeSparseTerms(tweet_matrix, sparse = 0.99)
# Print results
tweet_matrix
## <<DocumentTermMatrix (documents: 19986, terms: 38909)>>
## Non-/sparse entries: 176707/777458567
## Sparsity : 100%
## Maximal term length: 37
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
less_sparse_matrix
## <<DocumentTermMatrix (documents: 19986, terms: 56)>>
## Non-/sparse entries: 48853/1070363
## Sparsity : 96%
## Maximal term length: 14
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
less_sparse_matrix <- tm::removeSparseTerms(tweet_matrix, sparse =0.9999)
# Print results
tweet_matrix
## <<DocumentTermMatrix (documents: 19986, terms: 38909)>>
## Non-/sparse entries: 176707/777458567
## Sparsity : 100%
## Maximal term length: 37
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
less_sparse_matrix
## <<DocumentTermMatrix (documents: 19986, terms: 9566)>>
## Non-/sparse entries: 147364/191038712
## Sparsity : 100%
## Maximal term length: 27
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
set.seed(2001021530)
rightTweet <- russian_tweets %>%
filter(account_type=="Right") %>%
sample_n(2000)
leftTweet <- russian_tweets %>%
filter(account_type=="Left") %>%
sample_n(2000)
idx <- sample(1:4000, 4000, replace=FALSE)
leftRightData <- rbind(rightTweet, leftTweet)[idx, ]
leftRight_tokens <- leftRightData %>%
tidytext::unnest_tokens(output = "word", token = "words", input = content) %>%
anti_join(tidytext::stop_words) %>%
mutate(word = SnowballC::wordStem(word))
## Joining, by = "word"
# Create a document term matrix
left_right_matrix_small <- leftRight_tokens %>%
count(tweet_id, word) %>%
tidytext::cast_dtm(document = tweet_id, term = word, value = n, weighting = tm::weightTfIdf) %>%
tm::removeSparseTerms(sparse = 0.99)
left_right_labels <- c()
for (lbl in rownames(as.matrix(left_right_matrix_small))) {
newPoint <- leftRightData %>%
filter(tweet_id==lbl) %>%
pull(account_type)
left_right_labels <- c(left_right_labels, newPoint)
}
left_right_labels <- as.factor(left_right_labels)
# Create train/test split
set.seed(1111)
sample_size <- floor(0.75 * nrow(left_right_matrix_small))
train_ind <- sample(nrow(left_right_matrix_small), size = sample_size)
train <- left_right_matrix_small[train_ind, ]
test <- left_right_matrix_small[-train_ind, ]
# Create a random forest classifier
rfc <- randomForest::randomForest(x = as.data.frame(as.matrix(train)),
y = left_right_labels[train_ind], nTree = 50
)
# Print the results
rfc
##
## Call:
## randomForest(x = as.data.frame(as.matrix(train)), y = left_right_labels[train_ind], nTree = 50)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 8
##
## OOB estimate of error rate: 19.51%
## Confusion matrix:
## Left Right class.error
## Left 1369 121 0.08120805
## Right 464 1044 0.30769231
# Percentage correctly labeled "Left"
# left <- (350) / (350 + 157)
# left
# Percentage correctly labeled "Right"
# right <- (436) / (436 + 57)
# right
# Overall Accuracy:
# accuracy <- (350 + 436) / (350 + 436 + 57 + 157)
# accuracy
napolSents <- animal_farm %>%
tidytext::unnest_tokens(output = "sentences", input = text_column, token = "sentences") %>%
mutate(sentence_id=row_number(), napoleon=str_detect(sentences, 'napoleon')) %>%
filter(napoleon)
pig_tokens <- napolSents %>%
tidytext::unnest_tokens(output = "word", token = "words", input = sentences) %>%
anti_join(tidytext::stop_words) %>%
mutate(word = SnowballC::wordStem(word))
## Joining, by = "word"
# Create a document term matrix
pig_matrix <- pig_tokens %>%
count(sentence_id, word) %>%
tidytext::cast_dtm(document = sentence_id, term = word, value = n, weighting = tm::weightTf) %>%
tm::removeSparseTerms(sparse=0.995)
# Perform Topic Modeling
sentence_lda <-
topicmodels::LDA(pig_matrix, k = 10, method = 'Gibbs', control = list(seed = 1111))
# Extract the beta matrix
sentence_betas <- generics::tidy(sentence_lda, matrix = "beta")
# Topic #2
sentence_betas %>%
filter(topic == 2) %>%
arrange(-beta)
## # A tibble: 859 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 2 anim 0.0716
## 2 2 comrad 0.0678
## 3 2 windmil 0.0266
## 4 2 snowball' 0.0191
## 5 2 squealer 0.0191
## 6 2 cri 0.0154
## 7 2 forward 0.0154
## 8 2 walk 0.0116
## 9 2 hear 0.0116
## 10 2 command 0.0116
## # ... with 849 more rows
# Topic #10
sentence_betas %>%
filter(topic == 3) %>%
arrange(-beta)
## # A tibble: 859 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 3 napoleon' 0.0524
## 2 3 usual 0.0221
## 3 3 black 0.0178
## 4 3 moment 0.0178
## 5 3 pronounc 0.0178
## 6 3 effort 0.0134
## 7 3 gun 0.0134
## 8 3 boar 0.00909
## 9 3 cast 0.00909
## 10 3 pig 0.00909
## # ... with 849 more rows
# Extract the beta and gamma matrices
sentence_betas <- generics::tidy(sentence_lda, matrix = "beta")
sentence_gammas <- generics::tidy(sentence_lda, matrix = "gamma")
# Explore Topic 5 Betas
sentence_betas %>%
filter(topic == 5) %>%
arrange(-beta)
## # A tibble: 859 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 5 appear 0.0386
## 2 5 leader 0.0271
## 3 5 stood 0.0233
## 4 5 half 0.0157
## 5 5 agreement 0.0157
## 6 5 voic 0.0157
## 7 5 abolish 0.0118
## 8 5 time 0.0118
## 9 5 death 0.0118
## 10 5 drink 0.0118
## # ... with 849 more rows
# Explore Topic 5 Gammas
sentence_gammas %>%
filter(topic == 5) %>%
arrange(-gamma)
## # A tibble: 157 x 3
## document topic gamma
## <chr> <int> <dbl>
## 1 1074 5 0.157
## 2 954 5 0.153
## 3 1152 5 0.152
## 4 1370 5 0.151
## 5 225 5 0.149
## 6 1518 5 0.148
## 7 1171 5 0.147
## 8 1355 5 0.140
## 9 1521 5 0.135
## 10 968 5 0.133
## # ... with 147 more rows
# Print the topic setence for topic 5
napolSents$sentences[which(napolSents$sentence_id == (sentence_gammas %>% group_by(topic) %>%
top_n(1, gamma) %>% filter(topic==5) %>%
pull(document) %>% as.numeric()
)
)
]
## [1] "then a sheep confessed to having urinated in the drinking pool--urged to do this, so she said, by snowball--and two other sheep confessed to having murdered an old ram, an especially devoted follower of napoleon, by chasing him round and round a bonfire when he was suffering from a cough."
right_tokens <- rightTweet %>%
tidytext::unnest_tokens(output = "word", token = "words", input = content) %>%
anti_join(tidytext::stop_words) %>%
mutate(word = SnowballC::wordStem(word))
## Joining, by = "word"
# Create a document term matrix
right_matrix <- right_tokens %>%
count(tweet_id, word) %>%
tidytext::cast_dtm(document = tweet_id, term = word, value = n, weighting = tm::weightTf)
# Setup train and test data
sample_size <- floor(0.90 * nrow(right_matrix))
set.seed(1111)
train_ind <- sample(nrow(right_matrix), size = sample_size)
train <- right_matrix[train_ind, ]
test <- right_matrix[-train_ind, ]
# Peform topic modeling
lda_model <- topicmodels::LDA(train, k = 5, method = "Gibbs",control = list(seed = 1111))
# Train
topicmodels::perplexity(lda_model, newdata = train)
## [1] 596.7166
# Test
topicmodels::perplexity(lda_model, newdata = test)
## [1] 854.6082
# Extract the gamma matrix
gamma_values <- generics::tidy(sentence_lda, matrix = "gamma")
# Create grouped gamma tibble
grouped_gammas <- gamma_values %>%
group_by(document) %>%
arrange(desc(gamma)) %>%
slice(1) %>%
group_by(topic)
# Count by topic
grouped_gammas %>%
tally(topic, sort=TRUE)
## # A tibble: 10 x 2
## topic n
## <int> <int>
## 1 9 99
## 2 8 96
## 3 5 85
## 4 10 80
## 5 6 66
## 6 2 64
## 7 7 63
## 8 4 60
## 9 3 54
## 10 1 24
# Average topic weight for top topic for each sentence
grouped_gammas %>%
summarize(avg=mean(gamma)) %>%
arrange(desc(avg))
## # A tibble: 10 x 2
## topic avg
## <int> <dbl>
## 1 9 0.144
## 2 7 0.137
## 3 1 0.136
## 4 8 0.135
## 5 5 0.135
## 6 6 0.135
## 7 4 0.133
## 8 10 0.132
## 9 2 0.130
## 10 3 0.130
Chapter 4 - Advanced Techniques
Sentiment Analysis:
Word Embeddings:
Additional NLP Analysis:
Wrap Up:
Example code includes:
# Print the lexicon
tidytext::get_sentiments("bing")
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ... with 6,776 more rows
# Count the different sentiment types
tidytext::get_sentiments("bing") %>%
count(sentiment) %>%
arrange(desc(n))
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
# Count the different sentiment types
tidytext::get_sentiments("loughran") %>%
count(sentiment) %>%
arrange(desc(n))
## # A tibble: 6 x 2
## sentiment n
## <chr> <int>
## 1 negative 2355
## 2 litigious 904
## 3 positive 354
## 4 uncertainty 297
## 5 constraining 184
## 6 superfluous 56
# Count how many times each score was used
tidytext::get_sentiments("afinn") %>%
count(value) %>%
arrange(desc(n))
## # A tibble: 11 x 2
## value n
## <dbl> <int>
## 1 -2 966
## 2 2 448
## 3 -1 309
## 4 -3 264
## 5 1 208
## 6 3 172
## 7 4 45
## 8 -4 43
## 9 -5 16
## 10 5 5
## 11 0 1
afSents <- animal_farm %>%
tidytext::unnest_tokens(output = "sentence", input = text_column, token = "sentences") %>%
mutate(sentence_id=row_number())
# Print the overall sentiment associated with each pig's sentences
for(name in c("napoleon", "snowball", "squealer")) {
# Filter to the sentences mentioning the pig
pig_sentences <- afSents[grepl(name, afSents$sentence), ]
# Tokenize the text
temp_tokens <- pig_sentences %>%
tidytext::unnest_tokens(output = "word", token = "words", input = sentence) %>%
anti_join(tidytext::stop_words)
# Use afinn to find the overall sentiment score
result <- temp_tokens %>%
inner_join(tidytext::get_sentiments("afinn")) %>%
summarise(sentiment = sum(value))
# Print the result
print(paste0(name, ": ", result$sentiment))
}
## Joining, by = "word"
## Joining, by = "word"
## [1] "napoleon: -45"
## Joining, by = "word"
## Joining, by = "word"
## [1] "snowball: -77"
## Joining, by = "word"
## Joining, by = "word"
## [1] "squealer: -30"
left_tokens <- russian_tweets %>%
filter(account_type=="Left") %>%
tidytext::unnest_tokens(output = "word", token = "words", input = content) %>%
anti_join(tidytext::stop_words)
## Joining, by = "word"
# Dictionaries
# anticipation <- tidytext::get_sentiments("bing") %>%
# filter(sentiment == "anticipation")
# joy <- tidytext::get_sentiments("nrc") %>%
# filter(sentiment == "joy")
# Print top words for Anticipation and Joy
# left_tokens %>%
# inner_join(anticipation, by = "word") %>%
# count(word, sort = TRUE)
# left_tokens %>%
# inner_join(joy, by = "word") %>%
# count(word, sort = TRUE)
# Initialize a h2o session
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 23 hours 41 minutes
## H2O cluster timezone: America/Chicago
## H2O data parsing timezone: UTC
## H2O cluster version: 3.28.0.2
## H2O cluster version age: 1 month and 13 days
## H2O cluster name: H2O_started_from_R_Dave_bvu150
## H2O cluster total nodes: 1
## H2O cluster total memory: 4.20 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4
## R Version: R version 3.6.2 (2019-12-12)
# Create an h2o object for left_right
h2o_object = as.h2o(leftRightData)
##
|
| | 0%
|
|======================================================================| 100%
# Tokenize the words from the column of text in left_right
tweet_words <- h2o.tokenize(h2o_object$content, "\\\\W+")
# Lowercase and remove stopwords
tweet_words <- h2o.tolower(tweet_words)
tweet_words = tweet_words[is.na(tweet_words) || (!tweet_words %in% tidytext::stop_words$word),]
tweet_words
## C1
## 1 defense
## 2 black
## 3 heritage
## 4 https
## 5 in62blh02i
## 6 blacklivesmatter
##
## [43868 rows x 1 column]
# set.seed(1111)
# Use 33% of the available data
# sample_size <- floor(0.33 * nrow(job_titles))
# sample_data <- sample(nrow(job_titles), size = sample_size)
# h2o_object = as.h2o(job_titles[sample_data, ])
# words <- h2o.tokenize(h2o_object$jobtitle, "\\\\W+")
# words <- h2o.tolower(words)
# words = words[is.na(words) || (!words %in% stop_words$word),]
# word2vec_model <- h2o.word2vec(words, min_word_freq=5, epochs = 10)
# Find synonyms for the word "teacher"
# h2o.findSynonyms(word2vec_model, "teacher", count=10)
# a: Labels each word within text as either a noun, verb, adjective, or other category.
# b: A model pre-trained on a vast amount of text data to create a language representation used for supervised learning.
# c: A type of analysis that looks to describe text as either positive or negative and can be used to find active vs passive terms.
# d: A modeling technique used to label entire text into a single category such as relevant or not-relevant.
# Sentiment Analysis
# SA <- c
# Classifcation Modeling
# CM <- d
# BERT
# BERT <- b
# Part-of-speech Tagging
# POS <- a
# e: Modeling techniques, including LDA, used to cluster text into groups or types based on similar words being used.
# f: A method for searching through text and tagging words that distinguish people, locations, or organizations.
# g: Method used to search text for specific patterns.
# h: Representing words using a large vector space where similar words are close together within the vector space.
# Named Entity Recognition
# NER <- f
# Topic Modeling
# TM <- e
# Word Embeddings
# WE <- h
# Regular Expressions
# REGEX <- g
Chapter 1 - Joining Tables
The inner_join verb:
Joining with a one-to-many relationship:
Joining three or more tables:
Example code includes:
parts <- readRDS("./RInputFiles/parts.rds")
part_categories <- readRDS("./RInputFiles/part_categories.rds")
inventory_parts <- readRDS("./RInputFiles/inventory_parts.rds")
inventories <- readRDS("./RInputFiles/inventories.rds")
sets <- readRDS("./RInputFiles/sets.rds")
themes <- readRDS("./RInputFiles/themes.rds")
colors <- readRDS("./RInputFiles/colors.rds")
# Use the suffix argument to replace .x and .y suffixes
parts %>%
inner_join(part_categories, by = c("part_cat_id" = "id"), suffix=c("_part", "_category"))
## # A tibble: 17,501 x 4
## part_num name_part part_cat_id name_category
## <chr> <chr> <dbl> <chr>
## 1 0901 Baseplate 16 x 30 with Set 080 Yello~ 1 Baseplates
## 2 0902 Baseplate 16 x 24 with Set 080 Small~ 1 Baseplates
## 3 0903 Baseplate 16 x 24 with Set 080 Red H~ 1 Baseplates
## 4 0904 Baseplate 16 x 24 with Set 080 Large~ 1 Baseplates
## 5 1 Homemaker Bookcase 2 x 4 x 4 7 Containers
## 6 10016414 Sticker Sheet #1 for 41055-1 58 Stickers
## 7 10026stk01 Sticker for Set 10026 - (44942/41841~ 58 Stickers
## 8 10039 Pullback Motor 8 x 4 x 2/3 44 Mechanical
## 9 10048 Minifig Hair Tousled 65 Minifig Headwear
## 10 10049 Minifig Shield Broad with Spiked Bot~ 27 Minifig Accesso~
## # ... with 17,491 more rows
# Combine the parts and inventory_parts tables
parts %>%
inner_join(inventory_parts, by=c("part_num"))
## # A tibble: 258,958 x 6
## part_num name part_cat_id inventory_id color_id quantity
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 0901 Baseplate 16 x 30 with S~ 1 1973 2 1
## 2 0902 Baseplate 16 x 24 with S~ 1 1973 2 1
## 3 0903 Baseplate 16 x 24 with S~ 1 1973 2 1
## 4 0904 Baseplate 16 x 24 with S~ 1 1973 2 1
## 5 1 Homemaker Bookcase 2 x 4~ 7 508 15 1
## 6 1 Homemaker Bookcase 2 x 4~ 7 1158 15 2
## 7 1 Homemaker Bookcase 2 x 4~ 7 6590 15 2
## 8 1 Homemaker Bookcase 2 x 4~ 7 9679 15 2
## 9 1 Homemaker Bookcase 2 x 4~ 7 12256 1 2
## 10 1 Homemaker Bookcase 2 x 4~ 7 13356 15 1
## # ... with 258,948 more rows
# Combine the parts and inventory_parts tables
inventory_parts %>%
inner_join(parts, by="part_num")
## # A tibble: 258,958 x 6
## inventory_id part_num color_id quantity name part_cat_id
## <dbl> <chr> <dbl> <dbl> <chr> <dbl>
## 1 21 3009 7 50 Brick 1 x 6 11
## 2 25 21019c00pa~ 15 1 Legs and Hips with Bl~ 61
## 3 25 24629pr0002 78 1 Minifig Head Special ~ 59
## 4 25 24634pr0001 5 1 Headwear Accessory Bo~ 27
## 5 25 24782pr0001 5 1 Minifig Hipwear Skirt~ 27
## 6 25 88646 0 1 Tile Special 4 x 3 wi~ 15
## 7 25 973pr3314c~ 5 1 Torso with 1 White Bu~ 60
## 8 26 14226c11 0 3 String with End Studs~ 31
## 9 26 2340px2 15 1 Tail 4 x 1 x 3 with '~ 35
## 10 26 2340px3 15 1 Tail 4 x 1 x 3 with '~ 35
## # ... with 258,948 more rows
sets %>%
# Add inventories using an inner join
inner_join(inventories, by="set_num") %>%
# Add inventory_parts using an inner join
inner_join(inventory_parts, by=c("id"="inventory_id"))
## # A tibble: 258,958 x 9
## set_num name year theme_id id version part_num color_id quantity
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 700.3-1 Medium Gift ~ 1949 365 24197 1 bdoor01 2 2
## 2 700.3-1 Medium Gift ~ 1949 365 24197 1 bdoor01 15 1
## 3 700.3-1 Medium Gift ~ 1949 365 24197 1 bdoor01 4 1
## 4 700.3-1 Medium Gift ~ 1949 365 24197 1 bslot02 15 6
## 5 700.3-1 Medium Gift ~ 1949 365 24197 1 bslot02 2 6
## 6 700.3-1 Medium Gift ~ 1949 365 24197 1 bslot02 4 6
## 7 700.3-1 Medium Gift ~ 1949 365 24197 1 bslot02 1 6
## 8 700.3-1 Medium Gift ~ 1949 365 24197 1 bslot02 14 6
## 9 700.3-1 Medium Gift ~ 1949 365 24197 1 bslot02a 15 6
## 10 700.3-1 Medium Gift ~ 1949 365 24197 1 bslot02a 2 6
## # ... with 258,948 more rows
# Count the number of colors and sort
sets %>%
inner_join(inventories, by = "set_num") %>%
inner_join(inventory_parts, by = c("id" = "inventory_id")) %>%
inner_join(colors, by = c("color_id" = "id"), suffix = c("_set", "_color")) %>%
count(name_color, sort=TRUE)
## # A tibble: 134 x 2
## name_color n
## <chr> <int>
## 1 Black 48068
## 2 White 30105
## 3 Light Bluish Gray 26024
## 4 Red 21602
## 5 Dark Bluish Gray 19948
## 6 Yellow 17088
## 7 Blue 12980
## 8 Light Gray 8632
## 9 Reddish Brown 6960
## 10 Tan 6664
## # ... with 124 more rows
Chapter 2 - Left and Right Joins
The left_join verb:
The right_join verb:
Joining tables to themselves:
Example code includes:
inventory_parts_joined <- inventory_parts %>%
inner_join(inventories, by=c("inventory_id"="id")) %>%
select(set_num, part_num, color_id, quantity)
str(inventory_parts_joined)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 258958 obs. of 4 variables:
## $ set_num : chr "3474-1" "71012-11" "71012-11" "71012-11" ...
## $ part_num: chr "3009" "21019c00pat004pr1033" "24629pr0002" "24634pr0001" ...
## $ color_id: num 7 15 78 5 5 0 5 0 15 15 ...
## $ quantity: num 50 1 1 1 1 1 1 3 1 1 ...
millennium_falcon <- inventory_parts_joined %>%
filter(set_num == "7965-1")
str(millennium_falcon)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 263 obs. of 4 variables:
## $ set_num : chr "7965-1" "7965-1" "7965-1" "7965-1" ...
## $ part_num: chr "12825" "2412b" "2412b" "2419" ...
## $ color_id: num 72 72 320 71 0 71 71 72 0 19 ...
## $ quantity: num 3 20 2 1 4 1 7 2 1 2 ...
star_destroyer <- inventory_parts_joined %>%
filter(set_num == "75190-1")
str(star_destroyer)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 293 obs. of 4 variables:
## $ set_num : chr "75190-1" "75190-1" "75190-1" "75190-1" ...
## $ part_num: chr "10247" "11203" "11212" "11212" ...
## $ color_id: num 0 0 72 71 72 71 0 72 71 0 ...
## $ quantity: num 12 6 1 6 8 1 8 1 2 1 ...
# Combine the star_destroyer and millennium_falcon tables
millennium_falcon %>%
left_join(star_destroyer, by=c("part_num", "color_id"), suffix=c("_falcon", "_star_destroyer"))
## # A tibble: 263 x 6
## set_num_falcon part_num color_id quantity_falcon set_num_star_de~
## <chr> <chr> <dbl> <dbl> <chr>
## 1 7965-1 12825 72 3 <NA>
## 2 7965-1 2412b 72 20 75190-1
## 3 7965-1 2412b 320 2 <NA>
## 4 7965-1 2419 71 1 <NA>
## 5 7965-1 2420 0 4 75190-1
## 6 7965-1 2420 71 1 <NA>
## 7 7965-1 2420 71 7 <NA>
## 8 7965-1 2431 72 2 <NA>
## 9 7965-1 2431 0 1 75190-1
## 10 7965-1 2431 19 2 <NA>
## # ... with 253 more rows, and 1 more variable: quantity_star_destroyer <dbl>
# Aggregate Millennium Falcon for the total quantity in each part
millennium_falcon_colors <- millennium_falcon %>%
group_by(color_id) %>%
summarize(total_quantity = sum(quantity))
# Aggregate Star Destroyer for the total quantity in each part
star_destroyer_colors <- star_destroyer %>%
group_by(color_id) %>%
summarize(total_quantity = sum(quantity))
# Left join the Millennium Falcon colors to the Star Destroyer colors
millennium_falcon_colors %>%
left_join(star_destroyer_colors, by="color_id", suffix=c("_falcon", "_star_destroyer"))
## # A tibble: 21 x 3
## color_id total_quantity_falcon total_quantity_star_destroyer
## <dbl> <dbl> <dbl>
## 1 0 201 336
## 2 1 15 23
## 3 4 17 53
## 4 14 3 4
## 5 15 15 17
## 6 19 95 12
## 7 28 3 16
## 8 33 5 NA
## 9 36 1 14
## 10 41 6 15
## # ... with 11 more rows
inventory_version_1 <- inventories %>%
filter(version == 1)
# Join versions to sets
sets %>%
left_join(inventory_version_1, by="set_num") %>%
# Filter for where version is na
filter(is.na(version))
## # A tibble: 1 x 6
## set_num name year theme_id id version
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 40198-1 Ludo game 2018 598 NA NA
parts %>%
count(part_cat_id) %>%
right_join(part_categories, by = c("part_cat_id" = "id")) %>%
# Filter for NA
filter(is.na(n))
## # A tibble: 1 x 3
## part_cat_id n name
## <dbl> <int> <chr>
## 1 66 NA Modulex
parts %>%
count(part_cat_id) %>%
right_join(part_categories, by = c("part_cat_id" = "id")) %>%
# Use replace_na to replace missing values in the n column
replace_na(list(n=0))
## # A tibble: 64 x 3
## part_cat_id n name
## <dbl> <dbl> <chr>
## 1 1 135 Baseplates
## 2 3 303 Bricks Sloped
## 3 4 1900 Duplo, Quatro and Primo
## 4 5 107 Bricks Special
## 5 6 128 Bricks Wedged
## 6 7 97 Containers
## 7 8 24 Technic Bricks
## 8 9 167 Plates Special
## 9 11 490 Bricks
## 10 12 85 Technic Connectors
## # ... with 54 more rows
themes %>%
# Inner join the themes table
inner_join(themes, by=c("id"="parent_id"), suffix=c("_parent", "_child")) %>%
# Filter for the "Harry Potter" parent name
filter(name_parent=="Harry Potter")
## # A tibble: 6 x 5
## id name_parent parent_id id_child name_child
## <dbl> <chr> <dbl> <dbl> <chr>
## 1 246 Harry Potter NA 247 Chamber of Secrets
## 2 246 Harry Potter NA 248 Goblet of Fire
## 3 246 Harry Potter NA 249 Order of the Phoenix
## 4 246 Harry Potter NA 250 Prisoner of Azkaban
## 5 246 Harry Potter NA 251 Sorcerer's Stone
## 6 246 Harry Potter NA 667 Fantastic Beasts
# Join themes to itself again to find the grandchild relationships
themes %>%
inner_join(themes, by = c("id" = "parent_id"), suffix = c("_parent", "_child")) %>%
inner_join(themes, by = c("id_child" = "parent_id"), suffix = c("_parent", "_grandchild"))
## # A tibble: 158 x 7
## id_parent name_parent parent_id id_child name_child id_grandchild name
## <dbl> <chr> <dbl> <dbl> <chr> <dbl> <chr>
## 1 1 Technic NA 5 Model 6 Airport
## 2 1 Technic NA 5 Model 7 Constructi~
## 3 1 Technic NA 5 Model 8 Farm
## 4 1 Technic NA 5 Model 9 Fire
## 5 1 Technic NA 5 Model 10 Harbor
## 6 1 Technic NA 5 Model 11 Off-Road
## 7 1 Technic NA 5 Model 12 Race
## 8 1 Technic NA 5 Model 13 Riding Cyc~
## 9 1 Technic NA 5 Model 14 Robot
## 10 1 Technic NA 5 Model 15 Traffic
## # ... with 148 more rows
themes %>%
# Left join the themes table to its own children
left_join(themes, by=c("id"="parent_id"), suffix=c("_parent", "_child")) %>%
# Filter for themes that have no child themes
filter(is.na(id_child))
## # A tibble: 586 x 5
## id name_parent parent_id id_child name_child
## <dbl> <chr> <dbl> <dbl> <chr>
## 1 2 Arctic Technic 1 NA <NA>
## 2 3 Competition 1 NA <NA>
## 3 4 Expert Builder 1 NA <NA>
## 4 6 Airport 5 NA <NA>
## 5 7 Construction 5 NA <NA>
## 6 8 Farm 5 NA <NA>
## 7 9 Fire 5 NA <NA>
## 8 10 Harbor 5 NA <NA>
## 9 11 Off-Road 5 NA <NA>
## 10 12 Race 5 NA <NA>
## # ... with 576 more rows
Chapter 3 - Full, Semi, and Anti Joins
The full_join verb:
The semi and anti-join verbs:
Visualizing set differences:
Example code includes:
inventory_parts_joined <- inventories %>%
inner_join(inventory_parts, by = c("id" = "inventory_id")) %>%
arrange(desc(quantity)) %>%
select(-id, -version)
str(inventory_parts_joined)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 258958 obs. of 4 variables:
## $ set_num : chr "40179-1" "40179-1" "40179-1" "40179-1" ...
## $ part_num: chr "3024" "3024" "3024" "3024" ...
## $ color_id: num 72 15 0 71 14 15 320 0 0 0 ...
## $ quantity: num 900 900 900 900 900 810 771 720 684 540 ...
inventory_parts_joined %>%
# Combine the sets table with inventory_parts_joined
inner_join(sets, by=c("set_num"="set_num")) %>%
# Combine the themes table with your first join
inner_join(themes, by=c("theme_id"="id"), suffix=c("_set", "_theme"))
## # A tibble: 258,958 x 9
## set_num part_num color_id quantity name_set year theme_id name_theme
## <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl> <chr>
## 1 40179-1 3024 72 900 Persona~ 2016 277 Mosaic
## 2 40179-1 3024 15 900 Persona~ 2016 277 Mosaic
## 3 40179-1 3024 0 900 Persona~ 2016 277 Mosaic
## 4 40179-1 3024 71 900 Persona~ 2016 277 Mosaic
## 5 40179-1 3024 14 900 Persona~ 2016 277 Mosaic
## 6 k34434~ 3024 15 810 Lego Mo~ 2003 277 Mosaic
## 7 21010-1 3023 320 771 Robie H~ 2011 252 Architect~
## 8 k34431~ 3024 0 720 Lego Mo~ 2003 277 Mosaic
## 9 42083-1 2780 0 684 Bugatti~ 2018 5 Model
## 10 k34434~ 3024 0 540 Lego Mo~ 2003 277 Mosaic
## # ... with 258,948 more rows, and 1 more variable: parent_id <dbl>
inventory_sets_themes <- inventory_parts_joined %>%
inner_join(sets, by = "set_num") %>%
inner_join(themes, by = c("theme_id" = "id"), suffix = c("_set", "_theme"))
str(inventory_sets_themes)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 258958 obs. of 9 variables:
## $ set_num : chr "40179-1" "40179-1" "40179-1" "40179-1" ...
## $ part_num : chr "3024" "3024" "3024" "3024" ...
## $ color_id : num 72 15 0 71 14 15 320 0 0 0 ...
## $ quantity : num 900 900 900 900 900 810 771 720 684 540 ...
## $ name_set : chr "Personalised Mosaic Portrait" "Personalised Mosaic Portrait" "Personalised Mosaic Portrait" "Personalised Mosaic Portrait" ...
## $ year : num 2016 2016 2016 2016 2016 ...
## $ theme_id : num 277 277 277 277 277 277 252 277 5 277 ...
## $ name_theme: chr "Mosaic" "Mosaic" "Mosaic" "Mosaic" ...
## $ parent_id : num 276 276 276 276 276 276 NA 276 1 276 ...
batman <- inventory_sets_themes %>%
filter(name_theme == "Batman")
str(batman)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 3783 obs. of 9 variables:
## $ set_num : chr "7787-1" "70904-1" "70904-1" "77903-1" ...
## $ part_num : chr "3873" "6141" "4032a" "3023" ...
## $ color_id : num 0 84 84 46 0 84 179 0 72 34 ...
## $ quantity : num 158 81 67 46 44 41 31 28 28 26 ...
## $ name_set : chr "The Bat-Tank: The Riddler and Bane's Hideout" "Clayface Splat Attack" "Clayface Splat Attack" "The Dark Knight of Gotham City" ...
## $ year : num 2007 2017 2017 2019 2007 ...
## $ theme_id : num 484 484 484 484 484 484 484 484 484 484 ...
## $ name_theme: chr "Batman" "Batman" "Batman" "Batman" ...
## $ parent_id : num 482 482 482 482 482 482 482 482 482 482 ...
star_wars <- inventory_sets_themes %>%
filter(name_theme == "Star Wars")
str(star_wars)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 5402 obs. of 9 variables:
## $ set_num : chr "7194-1" "7194-1" "7194-1" "75244-1" ...
## $ part_num : chr "2357" "3001" "2420" "2780" ...
## $ color_id : num 19 19 378 0 19 19 19 0 0 15 ...
## $ quantity : num 84 73 72 64 57 55 54 54 52 49 ...
## $ name_set : chr "Yoda" "Yoda" "Yoda" "Tantive IV" ...
## $ year : num 2002 2002 2002 2019 2002 ...
## $ theme_id : num 158 158 158 158 158 158 158 158 158 158 ...
## $ name_theme: chr "Star Wars" "Star Wars" "Star Wars" "Star Wars" ...
## $ parent_id : num NA NA NA NA NA NA NA NA NA NA ...
# Count the part number and color id, weight by quantity
(batman_parts <- batman %>%
count(part_num, color_id, wt=quantity))
## # A tibble: 2,071 x 3
## part_num color_id n
## <chr> <dbl> <dbl>
## 1 10113 0 11
## 2 10113 272 1
## 3 10113 320 1
## 4 10183 57 1
## 5 10190 0 2
## 6 10201 0 1
## 7 10201 4 3
## 8 10201 14 1
## 9 10201 15 6
## 10 10201 71 4
## # ... with 2,061 more rows
(star_wars_parts <- star_wars %>%
count(part_num, color_id, wt=quantity))
## # A tibble: 2,413 x 3
## part_num color_id n
## <chr> <dbl> <dbl>
## 1 10169 4 1
## 2 10197 0 2
## 3 10197 72 3
## 4 10201 0 21
## 5 10201 71 5
## 6 10247 0 9
## 7 10247 71 16
## 8 10247 72 12
## 9 10884 28 1
## 10 10928 72 6
## # ... with 2,403 more rows
(parts_joined <- batman_parts %>%
# Combine the star_wars_parts table
full_join(star_wars_parts, by=c("part_num", "color_id"), suffix=c("_batman", "_star_wars")) %>%
# Replace NAs with 0s in the n_batman and n_star_wars columns
replace_na(list(n_batman=0, n_star_wars=0)))
## # A tibble: 3,628 x 4
## part_num color_id n_batman n_star_wars
## <chr> <dbl> <dbl> <dbl>
## 1 10113 0 11 0
## 2 10113 272 1 0
## 3 10113 320 1 0
## 4 10183 57 1 0
## 5 10190 0 2 0
## 6 10201 0 1 21
## 7 10201 4 3 0
## 8 10201 14 1 0
## 9 10201 15 6 0
## 10 10201 71 4 5
## # ... with 3,618 more rows
parts_joined %>%
# Sort the number of star wars pieces in descending order
arrange(-n_star_wars) %>%
# Join the colors table to the parts_joined table
left_join(colors, by=c("color_id"="id")) %>%
# Join the parts table to the previous join
left_join(parts, by=c("part_num"), suffix=c("_color", "_part"))
## # A tibble: 3,628 x 8
## part_num color_id n_batman n_star_wars name_color rgb name_part part_cat_id
## <chr> <dbl> <dbl> <dbl> <chr> <chr> <chr> <dbl>
## 1 2780 0 104 392 Black #051~ Technic ~ 53
## 2 32062 0 1 141 Black #051~ Technic ~ 46
## 3 4274 1 56 118 Blue #005~ Technic ~ 53
## 4 6141 36 11 117 Trans-Red #C91~ Plate Ro~ 21
## 5 3023 71 10 106 Light Blu~ #A0A~ Plate 1 ~ 14
## 6 6558 1 30 106 Blue #005~ Technic ~ 53
## 7 43093 1 44 99 Blue #005~ Technic ~ 53
## 8 3022 72 14 95 Dark Blui~ #6C6~ Plate 2 ~ 14
## 9 2357 19 0 84 Tan #E4C~ Brick 2 ~ 11
## 10 6141 179 90 81 Flat Silv~ #898~ Plate Ro~ 21
## # ... with 3,618 more rows
batmobile <- inventory_parts_joined %>%
filter(set_num == "7784-1") %>%
select(-set_num)
str(batmobile)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 173 obs. of 3 variables:
## $ part_num: chr "3023" "2780" "50950" "3004" ...
## $ color_id: num 72 0 0 71 1 0 0 0 14 0 ...
## $ quantity: num 62 28 28 26 25 23 21 21 19 18 ...
batwing <- inventory_parts_joined %>%
filter(set_num == "70916-1") %>%
select(-set_num)
str(batwing)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 309 obs. of 3 variables:
## $ part_num: chr "3023" "3024" "3623" "11477" ...
## $ color_id: num 0 0 0 0 71 0 0 0 0 0 ...
## $ quantity: num 22 22 20 18 18 17 16 14 14 13 ...
# Filter the batwing set for parts that are also in the batmobile set
batwing %>%
semi_join(batmobile, by=c("part_num"))
## # A tibble: 126 x 3
## part_num color_id quantity
## <chr> <dbl> <dbl>
## 1 3023 0 22
## 2 3024 0 22
## 3 3623 0 20
## 4 2780 0 17
## 5 3666 0 16
## 6 3710 0 14
## 7 6141 4 12
## 8 2412b 71 10
## 9 6141 72 10
## 10 6558 1 9
## # ... with 116 more rows
# Filter the batwing set for parts that aren't in the batmobile set
batwing %>%
anti_join(batmobile, by=c("part_num"))
## # A tibble: 183 x 3
## part_num color_id quantity
## <chr> <dbl> <dbl>
## 1 11477 0 18
## 2 99207 71 18
## 3 22385 0 14
## 4 99563 0 13
## 5 10247 72 12
## 6 2877 72 12
## 7 61409 72 12
## 8 11153 0 10
## 9 98138 46 10
## 10 2419 72 9
## # ... with 173 more rows
# Use inventory_parts to find colors included in at least one set
colors %>%
semi_join(inventory_parts, by=c("id"="color_id"))
## # A tibble: 134 x 3
## id name rgb
## <dbl> <chr> <chr>
## 1 -1 [Unknown] #0033B2
## 2 0 Black #05131D
## 3 1 Blue #0055BF
## 4 2 Green #237841
## 5 3 Dark Turquoise #008F9B
## 6 4 Red #C91A09
## 7 5 Dark Pink #C870A0
## 8 6 Brown #583927
## 9 7 Light Gray #9BA19D
## 10 8 Dark Gray #6D6E5C
## # ... with 124 more rows
# Use filter() to extract version 1
version_1_inventories <- inventories %>%
filter(version==1)
# Use anti_join() to find which set is missing a version 1
sets %>%
anti_join(version_1_inventories, by=c("set_num"))
## # A tibble: 1 x 4
## set_num name year theme_id
## <chr> <chr> <dbl> <dbl>
## 1 40198-1 Ludo game 2018 598
(inventory_parts_themes <- inventories %>%
inner_join(inventory_parts, by = c("id" = "inventory_id")) %>%
arrange(desc(quantity)) %>%
select(-id, -version) %>%
inner_join(sets, by = "set_num") %>%
inner_join(themes, by = c("theme_id" = "id"), suffix = c("_set", "_theme")))
## # A tibble: 258,958 x 9
## set_num part_num color_id quantity name_set year theme_id name_theme
## <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl> <chr>
## 1 40179-1 3024 72 900 Persona~ 2016 277 Mosaic
## 2 40179-1 3024 15 900 Persona~ 2016 277 Mosaic
## 3 40179-1 3024 0 900 Persona~ 2016 277 Mosaic
## 4 40179-1 3024 71 900 Persona~ 2016 277 Mosaic
## 5 40179-1 3024 14 900 Persona~ 2016 277 Mosaic
## 6 k34434~ 3024 15 810 Lego Mo~ 2003 277 Mosaic
## 7 21010-1 3023 320 771 Robie H~ 2011 252 Architect~
## 8 k34431~ 3024 0 720 Lego Mo~ 2003 277 Mosaic
## 9 42083-1 2780 0 684 Bugatti~ 2018 5 Model
## 10 k34434~ 3024 0 540 Lego Mo~ 2003 277 Mosaic
## # ... with 258,948 more rows, and 1 more variable: parent_id <dbl>
batman_colors <- inventory_parts_themes %>%
# Filter the inventory_parts_themes table for the Batman theme
filter(name_theme=="Batman") %>%
group_by(color_id) %>%
summarize(total = sum(quantity)) %>%
# Add a percent column of the total divided by the sum of the total
mutate(percent=total/sum(total))
# Filter and aggregate the Star Wars set data; add a percent column
star_wars_colors <- inventory_parts_themes %>%
filter(name_theme=="Star Wars") %>%
group_by(color_id) %>%
summarize(total = sum(quantity)) %>%
mutate(percent=total/sum(total))
(colors_joined <- batman_colors %>%
full_join(star_wars_colors, by = "color_id", suffix = c("_batman", "_star_wars")) %>%
replace_na(list(total_batman = 0, total_star_wars = 0, percent_batman=0, percent_star_wars=0)) %>%
inner_join(colors, by = c("color_id" = "id")) %>%
# Create the difference and total columns
mutate(difference = percent_batman - percent_star_wars, total = total_batman + total_star_wars) %>%
# Filter for totals greater than 200
filter(total >= 200))
## # A tibble: 16 x 9
## color_id total_batman percent_batman total_star_wars percent_star_wa~ name
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 0 2807 0.296 3258 0.207 Black
## 2 1 243 0.0256 410 0.0261 Blue
## 3 4 529 0.0558 434 0.0276 Red
## 4 14 426 0.0449 207 0.0132 Yell~
## 5 15 404 0.0426 1771 0.113 White
## 6 19 142 0.0150 1012 0.0644 Tan
## 7 28 98 0.0103 183 0.0116 Dark~
## 8 36 86 0.00907 246 0.0156 Tran~
## 9 46 200 0.0211 39 0.00248 Tran~
## 10 70 297 0.0313 373 0.0237 Redd~
## 11 71 1148 0.121 3264 0.208 Ligh~
## 12 72 1453 0.153 2433 0.155 Dark~
## 13 84 278 0.0293 31 0.00197 Medi~
## 14 179 154 0.0162 232 0.0148 Flat~
## 15 378 22 0.00232 430 0.0273 Sand~
## 16 7 0 0 209 0.0133 Ligh~
## # ... with 3 more variables: rgb <chr>, difference <dbl>, total <dbl>
color_palette <- c('#05131D', '#0055BF', '#C91A09', '#F2CD37', '#FFFFFF', '#E4CD9E', '#958A73', '#C91A09', '#F5CD2F', '#582A12', '#A0A5A9', '#6C6E68', '#CC702A', '#898788', '#A0BCAC', '#D3D3D3')
names(color_palette) <- c('Black', 'Blue', 'Red', 'Yellow', 'White', 'Tan', 'Dark Tan', 'Trans-Red', 'Trans-Yellow', 'Reddish Brown', 'Light Bluish Gray', 'Dark Bluish Gray', 'Medium Dark Flesh', 'Flat Silver', 'Sand Green', 'Light Gray')
color_palette
## Black Blue Red Yellow
## "#05131D" "#0055BF" "#C91A09" "#F2CD37"
## White Tan Dark Tan Trans-Red
## "#FFFFFF" "#E4CD9E" "#958A73" "#C91A09"
## Trans-Yellow Reddish Brown Light Bluish Gray Dark Bluish Gray
## "#F5CD2F" "#582A12" "#A0A5A9" "#6C6E68"
## Medium Dark Flesh Flat Silver Sand Green Light Gray
## "#CC702A" "#898788" "#A0BCAC" "#D3D3D3"
# Create a bar plot using colors_joined and the name and difference columns
ggplot(colors_joined, aes(x=reorder(name, difference), y=difference, fill = name)) +
geom_col() +
coord_flip() +
scale_fill_manual(values = color_palette, guide = FALSE) +
labs(y = "Difference: Batman - Star Wars")
Chapter 4 - Case Study: Stack Overflow
Stack Overflow Questions:
Joining Questions and Answers:
The bind_rows verb:
Wrap up:
Example code includes:
questions <- readRDS("./RInputFiles/questions.rds")
tags <- readRDS("./RInputFiles/tags.rds")
question_tags <- readRDS("./RInputFiles/question_tags.rds")
answers <- readRDS("./RInputFiles/answers.rds")
# Replace the NAs in the tag_name column
questions_with_tags <- questions %>%
left_join(question_tags, by = c("id" = "question_id")) %>%
left_join(tags, by = c("tag_id" = "id")) %>%
replace_na(list(tag_name="only-r"))
questions_with_tags %>%
# Group by tag_name
group_by(tag_name) %>%
# Get mean score and num_questions
summarize(score = mean(score), num_questions = n()) %>%
# Sort num_questions in descending order
arrange(-num_questions)
## # A tibble: 7,841 x 3
## tag_name score num_questions
## <chr> <dbl> <int>
## 1 only-r 1.26 48541
## 2 ggplot2 2.61 28228
## 3 dataframe 2.31 18874
## 4 shiny 1.45 14219
## 5 dplyr 1.95 14039
## 6 plot 2.24 11315
## 7 data.table 2.97 8809
## 8 matrix 1.66 6205
## 9 loops 0.743 5149
## 10 regex 2 4912
## # ... with 7,831 more rows
# Using a join, filter for tags that are never on an R question
tags %>%
anti_join(question_tags, by=c("id"="tag_id"))
## # A tibble: 40,459 x 2
## id tag_name
## <dbl> <chr>
## 1 124399 laravel-dusk
## 2 124402 spring-cloud-vault-config
## 3 124404 spring-vault
## 4 124405 apache-bahir
## 5 124407 astc
## 6 124408 simulacrum
## 7 124410 angulartics2
## 8 124411 django-rest-viewsets
## 9 124414 react-native-lightbox
## 10 124417 java-module
## # ... with 40,449 more rows
questions %>%
# Inner join questions and answers with proper suffixes
inner_join(answers, by=c("id"="question_id"), suffix=c("_question", "_answer")) %>%
# Subtract creation_date_question from creation_date_answer to create gap
mutate(gap = as.integer(creation_date_answer-creation_date_question))
## # A tibble: 380,643 x 7
## id creation_date_q~ score_question id_answer creation_date_a~
## <int> <date> <int> <int> <date>
## 1 2.26e7 2014-03-21 1 22560670 2014-03-21
## 2 2.26e7 2014-03-21 2 22558516 2014-03-21
## 3 2.26e7 2014-03-21 2 22558726 2014-03-21
## 4 2.26e7 2014-03-21 2 22558085 2014-03-21
## 5 2.26e7 2014-03-21 2 22606545 2014-03-24
## 6 2.26e7 2014-03-21 2 22610396 2014-03-24
## 7 2.26e7 2014-03-21 2 34374729 2015-12-19
## 8 2.26e7 2014-03-21 2 22559327 2014-03-21
## 9 2.26e7 2014-03-21 2 22560102 2014-03-21
## 10 2.26e7 2014-03-21 2 22560288 2014-03-21
## # ... with 380,633 more rows, and 2 more variables: score_answer <int>,
## # gap <int>
# Count and sort the question id column in the answers table
answer_counts <- answers %>%
count(question_id, sort=TRUE)
# Combine the answer_counts and questions tables
question_answer_counts <- questions %>%
left_join(answer_counts, by=c("id"="question_id")) %>%
# Replace the NAs in the n column
replace_na(list(n=0))
tagged_answers <- question_answer_counts %>%
# Join the question_tags tables
inner_join(question_tags, by=c("id"="question_id")) %>%
# Join the tags table
inner_join(tags, by=c("tag_id"="id"))
tagged_answers %>%
# Aggregate by tag_name
group_by(tag_name) %>%
# Summarize questions and average_answers
summarize(questions = n(), average_answers = mean(n)) %>%
# Sort the questions in descending order
arrange(-questions)
## # A tibble: 7,840 x 3
## tag_name questions average_answers
## <chr> <int> <dbl>
## 1 ggplot2 28228 1.15
## 2 dataframe 18874 1.67
## 3 shiny 14219 0.921
## 4 dplyr 14039 1.55
## 5 plot 11315 1.23
## 6 data.table 8809 1.47
## 7 matrix 6205 1.45
## 8 loops 5149 1.39
## 9 regex 4912 1.91
## 10 function 4892 1.30
## # ... with 7,830 more rows
# Inner join the question_tags and tags tables with the questions table
questions_with_tags <- questions %>%
inner_join(question_tags, by = c("id"="question_id")) %>%
inner_join(tags, by = c("tag_id"="id"))
# Inner join the question_tags and tags tables with the answers table
answers_with_tags <- answers %>%
inner_join(question_tags, by = c("question_id"="question_id")) %>%
inner_join(tags, by = c("tag_id"="id"))
# Combine the two tables into posts_with_tags
posts_with_tags <- bind_rows(questions_with_tags %>% mutate(type = "question"), answers_with_tags %>% mutate(type = "answer"))
# Add a year column, then aggregate by type, year, and tag_name
by_type_year_tag <- posts_with_tags %>%
mutate(year=lubridate::year(creation_date)) %>%
count(type, year, tag_name)
# Filter for the dplyr and ggplot2 tag names
by_type_year_tag_filtered <- by_type_year_tag %>%
filter(tag_name %in% c("dplyr", "ggplot2"))
# Create a line plot faceted by the tag name
ggplot(by_type_year_tag_filtered, aes(x=year, y=n, color = type)) +
geom_line() +
facet_wrap(~ tag_name)
Chapter 1 - Introducing TensorFlow in R
What is TensorFlow?
TensorFlow Syntax, Variables, and Placeholders:
TensorBoard - Visualizing TensorFlow Models:
Example code includes:
# Miniconda has been successfully installed at "C:/.../AppData/Local/r-miniconda".
# Need to install and PATH tensorflow for this to work
library(tensorflow)
# Create your session
sess <- tf$Session()
# Define a constant (you'll learn this next!)
HiThere <- tf$constant('Hi DataCamp Student!')
# Run your session with the HiThere constant
print(sess$run(HiThere))
# Close the session
sess$close()
# Create two constant tensors
myfirstconstanttensor <- tf$constant(152)
mysecondconstanttensor <- tf$constant('I am a tensor master!')
# Create a matrix of zeros
myfirstvariabletensor <- tf$Variable(tf$zeros(shape(5, 1)))
# Set up your session
EmployeeSession <- tf$Session()
# Add your constants
female <- tf$constant(150, name = "FemaleEmployees")
male <- tf$constant(135, name = "MaleEmployees")
total <- tf$add(female, male)
print(EmployeeSession$run(total))
# Write to file
towrite <- tf$summary$FileWriter('./graphs', EmployeeSession$graph)
# Open Tensorboard
tensorboard(log_dir = './graphs')
# From last exercise
total <- tf$add(female,male)
# Multiply your allemps by growth projections
growth <- tf$constant(1.32, name = "EmpGrowth")
EmpGrowth <- tf$math$multiply(total, growth)
print(EmployeeSession$run(EmpGrowth))
# Write to file
towrite <- tf$summary$FileWriter('./graphs', EmployeeSession$graph)
# Open Tensorboard
tensorboard(log_dir = './graphs')
# Start Session
sess <- tf$Session()
# Create 2 constants
a <- tf$constant(10)
b <- tf$constant(32)
# Add your two constants together
sess$run(a + b)
# Create a Variable
mytestvariable <- tf$Variable(tf$zeros(shape(1L)))
# Run the last line
mytestvariable
Chapter 2 - Linear Regression Using Two TensorFlow API
Core API: Linear Regression:
Core API: Linear Regression Part II:
Core API: Linear Regression Part III:
sess$run(train) if (step %% 500 == 0) cat("Step = ", step, "Estimate w = ", sess$run(w), "Estimate b = ", sess$run(b)) Estimators API: Multiple Linear Regression:
Example code includes:
# Parse out the minimum study time and final percent in x_data and y_data variables
x_data <- studentgradeprediction_train$minstudytime
y_data <- studentgradeprediction_train$Finalpercent
# Define your w variable
w <- tf$Variable(tf$random_uniform(shape(1L), -1.0, 1.0))
# Define your b variable
b <- tf$Variable(tf$zeros(shape(1L)))
# Define your linear equation
y <- w * x_data + b
# Define cost function
loss <- tf$reduce_mean((y-y_data)^2)
# Use the Gradient Descent Optimizer
optimizer <- tf$train$GradientDescentOptimizer(0.0001)
# Minimize MSE loss
train <- optimizer$minimize(loss)
# Launch new session
Finalgradessession <- tf$Session()
# Initialize (run) global variables
Finalgradessession$run(tf$global_variables_initializer())
# Train your model
for (step in 1:3750) {
Finalgradessession$run(train)
if (step %% 750 == 0) cat("Step = ", step, "Estimate w = ", Finalgradessession$run(w), "Estimate b =", Finalgradessession$run(b), "\n")
}
# Calculate the predicted grades
grades_actual <- studentgradeprediction_test$Finalpercent
grades_predicted <- as.vector(Finalgradessession$run(w)) *
studentgradeprediction_test$minstudytime +
as.vector(Finalgradessession$run(b))
# Plot the actual and predicted grades
plot(grades_actual, grades_predicted, pch=19, col='red')
# Run a correlation
cor(grades_actual, grades_predicted)
# Define all four of your feature columns
ftr_colns <- feature_columns(
)
# Choose the correct model
grademodel <- linear_regressor(feature_columns = ftr_colns)
# Define your input function
grade_input_fn <- function(data){
}
# Train your model
train(grademodel, grade_input_fn(train))
# Evaluate your model
model_eval <- evaluate(grademodel, grade_input_fn(test))
# See the results
model_eval
# Calculate the predictions
predictoutput <- predict(grademodel, input_fn=grademodel_input_fn(studentgradeprediction_test))
# Plot actual and predicted values
plot(studentgradeprediction_test$Finalpercent, as.numeric(predictoutput$predictions),
xlab = "actual_grades", ylab = "predicted_grades", pch=19, col='red'
)
# Calculate the correlation
cor(as.numeric(predictoutput$predictions), studentgradeprediction_test$Finalpercent)
Chapter 3 - Deep Learning in TensorFlow: Creating a Deep Neural Network
Gentle Introduction to Neural Networks:
Deep Neural Networks Using Keras API:
Evaluate, Predict, Visualize Model:
Create DNN Using Estimators API:
Example code includes:
# Define the model
model <- keras_model_sequential()
model %>%
layer_dense(units=15, activation = 'relu', input_shape = 8) %>%
layer_dense(units=5, activation = 'relu') %>%
layer_dense(units=1)
# Compile the model
model %>%
compile(optimizer = 'rmsprop', loss = 'mse', metrics = c('accuracy'))
# Fit the model
model %>%
fit(x = train_x, y = train_y, epochs = 25, batch_size=32, validation_split = .2)
# Evaluate the model
score <- model %>%
evaluate(test_x, test_y)
# Call up the accuracy
score$acc
# Predict based on your model
predictedclasses <- model %>%
predict_classes(newdata_x)
# Print predicted classes with customers' names
rownames(predictedclasses) <- c('Jasmit', 'Banjeet')
predictedclasses
# Fit the model and define callbacks
model %>%
fit(x = train_x, y = train_y,epochs = 25, batch_size = 32, validation_split = .2,
callbacks = callback_tensorboard("logs/run_1")
)
# Call TensorBoard
tensorboard("logs/run_1")
# Train the model
train(dnnclassifier, input_fn = shopping_input_function(shopper_train))
# Evaluate the model by correcting the error
evaluate(dnnclassifier, input_fn = shopping_input_function(shopper_test))
# Create a sequential model and the network architecture
ourdnnmodel <- keras_model_sequential() %>%
layer_dense(units = 10, activation = "relu", input_shape = ncol(train_x)) %>%
layer_dense(units = 5, activation = "relu") %>%
layer_dense(units = 1) %>%
compile(optimizer = 'rmsprop', loss = 'mse', metrics = c("mae", "accuracy"))
# Fit your model
learn <- ourdnnmodel %>%
fit(x = train_x, y = train_y, epochs = 25, batch_size = 32, validation_split = 0.2, verbose = FALSE)
# Run the learn function
learn
Chapter 4 - Deep Learning in TensorFlow: Increasing Model Accuracy
L2 Regularization Using Keras:
Dropout Technique Using TFEstimators:
Hyperparameter Tuning with tfruns:
Wrap Up:
Example code includes:
# Define the model
model_lesson1 <- keras_model_sequential()
# Add the regularizer
model_lesson1 %>%
layer_dense(units=15, activation='relu', input_shape=8, kernel_regularizer=regularizer_l2(l=0.1)) %>%
layer_dense(units=5, activation = 'relu') %>%
layer_dense(units=1)
# Compile the model
model_lesson1 %>%
compile(optimizer = 'rmsprop', loss = 'mse', metrics = c('accuracy'))
# Fit the model
model_lesson1 %>%
fit(x = train_x, y = train_y, epochs = 25, batch_size = 32, validation_split=0.2)
# Evaluate the model
score_lesson1 <- model_lesson1 %>%
evaluate(test_x, test_y)
# Call the accuracy and loss
score_lesson1$acc
score_lesson1$loss
# Define the feature columns
featcols <- feature_columns(
tf$feature_column$numeric_column("Var"), tf$feature_column$numeric_column("Skew"),
tf$feature_column$numeric_column("Kurt"), tf$feature_column$numeric_column("Entropy")
)
# Create the input function
banknote_input_fn <- function(data){
input_fn(data, features = c("Var", "Skew", "Kurt", "Entropy"), response = "Class")
}
# Create your dnn_classifier model
mymodel <- dnn_classifier(feature_columns = featcols, hidden_units = c(40, 60, 10), n_classes = 2,
label_vocabulary = c("N", "Y"), dropout = 0.2
)
# Train the model
train(mymodel, input_fn = banknote_input_fn(banknote_authentication_train))
# Evaluate your model using the testing dataset
final_evaluation <- evaluate(mymodel, input_fn = banknote_input_fn(banknote_authentication_test))
# Call up the accuracy and precision of your evaluated model
final_evaluation$accuracy
final_evaluation$precision
# Tune the run
runs <- tuning_run(modelsourcecode_script, flags = list(dropout = c(0.2, 0.3, 0.4)))
# View the outcome
runs[order(runs$eval_accuracy, decreasing = TRUE), ]
# Tune the run
runs <- tuning_run(
modelsourcecode_script, flags = list(dropout = c(0.2, 0.3, 0.4), activation = c("relu", "softmax") )
)
# View the outcome
runs[order(runs$eval_accuracy, decreasing = TRUE), ]
Chapter 1 - Introduction to Market Basket Analysis
Market Basket Introduction:
Item Combinations:
What is Market Basket Analysis?
Example code includes:
Online_Retail_2011_Q1 <- readr::read_csv("./RInputFiles/Online_Retail_2011_Q1.xls")
## Parsed with column specification:
## cols(
## InvoiceNo = col_character(),
## StockCode = col_character(),
## Description = col_character(),
## Quantity = col_double(),
## InvoiceDate = col_character(),
## UnitPrice = col_double(),
## CustomerID = col_double(),
## Country = col_character()
## )
str(Online_Retail_2011_Q1)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 99602 obs. of 8 variables:
## $ InvoiceNo : chr "539993" "539993" "539993" "539993" ...
## $ StockCode : chr "22386" "21499" "21498" "22379" ...
## $ Description: chr "JUMBO BAG PINK POLKADOT" "BLUE POLKADOT WRAP" "RED RETROSPOT WRAP" "RECYCLING BAG RETROSPOT" ...
## $ Quantity : num 10 25 25 5 10 10 6 12 6 8 ...
## $ InvoiceDate: chr "04/01/2011 10:00" "04/01/2011 10:00" "04/01/2011 10:00" "04/01/2011 10:00" ...
## $ UnitPrice : num 1.95 0.42 0.42 2.1 1.25 1.95 3.25 1.45 2.95 1.95 ...
## $ CustomerID : num 13313 13313 13313 13313 13313 ...
## $ Country : chr "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
## - attr(*, "spec")=
## .. cols(
## .. InvoiceNo = col_character(),
## .. StockCode = col_character(),
## .. Description = col_character(),
## .. Quantity = col_double(),
## .. InvoiceDate = col_character(),
## .. UnitPrice = col_double(),
## .. CustomerID = col_double(),
## .. Country = col_character()
## .. )
movie_subset <- readr::read_csv("./RInputFiles/Movie_subset.xls")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## userId = col_double(),
## movieId = col_double(),
## title = col_character(),
## year = col_double(),
## genres = col_character()
## )
str(movie_subset)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 19455 obs. of 6 variables:
## $ X1 : num 1 2 3 4 5 6 7 8 9 10 ...
## $ userId : num 1323 1323 1323 1323 1323 ...
## $ movieId: num 1 3 5 10 11 12 15 16 17 19 ...
## $ title : chr "Toy Story" "Grumpier Old Men" "Father of the Bride Part II" "GoldenEye" ...
## $ year : num 1995 1995 1995 1995 1995 ...
## $ genres : chr "Adventure|Animation|Children|Comedy|Fantasy" "Comedy|Romance" "Comedy" "Action|Adventure|Thriller" ...
## - attr(*, "spec")=
## .. cols(
## .. X1 = col_double(),
## .. userId = col_double(),
## .. movieId = col_double(),
## .. title = col_character(),
## .. year = col_double(),
## .. genres = col_character()
## .. )
# Have a glimpse at the dataset
glimpse(Online_Retail_2011_Q1)
## Observations: 99,602
## Variables: 8
## $ InvoiceNo <chr> "539993", "539993", "539993", "539993", "539993", "5399...
## $ StockCode <chr> "22386", "21499", "21498", "22379", "20718", "85099B", ...
## $ Description <chr> "JUMBO BAG PINK POLKADOT", "BLUE POLKADOT WRAP", "RED R...
## $ Quantity <dbl> 10, 25, 25, 5, 10, 10, 6, 12, 6, 8, 6, 6, 6, 12, 12, 8,...
## $ InvoiceDate <chr> "04/01/2011 10:00", "04/01/2011 10:00", "04/01/2011 10:...
## $ UnitPrice <dbl> 1.95, 0.42, 0.42, 2.10, 1.25, 1.95, 3.25, 1.45, 2.95, 1...
## $ CustomerID <dbl> 13313, 13313, 13313, 13313, 13313, 13313, 13313, 13313,...
## $ Country <chr> "United Kingdom", "United Kingdom", "United Kingdom", "...
# Filter a single basket
One_basket = Online_Retail_2011_Q1 %>%
filter(InvoiceNo == 540180)
print(One_basket)
## # A tibble: 12 x 8
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID
## <chr> <chr> <chr> <dbl> <chr> <dbl> <dbl>
## 1 540180 85123A WHITE HANG~ 2 05/01/2011~ 2.95 15984
## 2 540180 22083 PAPER CHAI~ 2 05/01/2011~ 2.95 15984
## 3 540180 22759 SET OF 3 N~ 4 05/01/2011~ 1.65 15984
## 4 540180 21677 HEARTS ST~ 4 05/01/2011~ 0.85 15984
## 5 540180 22168 ORGANISER ~ 1 05/01/2011~ 8.5 15984
## 6 540180 22113 GREY HEART~ 4 05/01/2011~ 3.75 15984
## 7 540180 84978 HANGING HE~ 6 05/01/2011~ 1.25 15984
## 8 540180 22558 CLOTHES PE~ 1 05/01/2011~ 1.49 15984
## 9 540180 22163 HEART STRI~ 1 05/01/2011~ 2.95 15984
## 10 540180 22164 STRING OF ~ 1 05/01/2011~ 2.95 15984
## 11 540180 85123A WHITE HANG~ 6 05/01/2011~ 2.95 15984
## 12 540180 22297 HEART IVOR~ 24 05/01/2011~ 1.25 15984
## # ... with 1 more variable: Country <chr>
# Basket size
n_distinct(One_basket$StockCode)
## [1] 11
# Total number of items purchased
One_basket %>%
summarize(sum(Quantity))
## # A tibble: 1 x 1
## `sum(Quantity)`
## <dbl>
## 1 56
# Plot the total number of items within the basket
ggplot(One_basket, aes(x=reorder(Description, Quantity, function(x) sum(x)), y = Quantity)) +
geom_col() +
coord_flip() +
xlab("Items")
# Number of items
n_items = 10
# Initialize an empty matrix
combi = matrix(NA, nrow = n_items+1, ncol = 2)
# Loop over all values of k
for (i in 0:n_items){
combi[i+1, ] = c(i, choose(n_items, i))
}
# Sum over all values of k
sum(combi[, 2])
## [1] 1024
# Total number of possible baskets
2^10
## [1] 1024
# Define number of items
n_items = 100
# Specify the function to be plotted
fun_combi = function(x) choose(n_items, x)
# Plot the number of combinations
ggplot(data = data.frame(x = 0), mapping = aes(x = x)) +
stat_function(fun = fun_combi) + xlim(0, n_items)
# Select two baskets
Two_baskets = Online_Retail_2011_Q1 %>%
filter(InvoiceNo %in% c(540160, 540017))
# Basket size
Two_baskets %>%
group_by(InvoiceNo) %>%
summarise(n_total = n(), n_items = n_distinct(StockCode))
## # A tibble: 2 x 3
## InvoiceNo n_total n_items
## <chr> <int> <int>
## 1 540017 13 13
## 2 540160 3 3
Online_Retail_clean <- Online_Retail_2011_Q1[complete.cases(Online_Retail_2011_Q1), ]
str(Online_Retail_clean)
## Classes 'tbl_df', 'tbl' and 'data.frame': 70097 obs. of 8 variables:
## $ InvoiceNo : chr "539993" "539993" "539993" "539993" ...
## $ StockCode : chr "22386" "21499" "21498" "22379" ...
## $ Description: chr "JUMBO BAG PINK POLKADOT" "BLUE POLKADOT WRAP" "RED RETROSPOT WRAP" "RECYCLING BAG RETROSPOT" ...
## $ Quantity : num 10 25 25 5 10 10 6 12 6 8 ...
## $ InvoiceDate: chr "04/01/2011 10:00" "04/01/2011 10:00" "04/01/2011 10:00" "04/01/2011 10:00" ...
## $ UnitPrice : num 1.95 0.42 0.42 2.1 1.25 1.95 3.25 1.45 2.95 1.95 ...
## $ CustomerID : num 13313 13313 13313 13313 13313 ...
## $ Country : chr "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
# Create dataset with basket counts and inspect results
basket_size = Online_Retail_clean %>%
group_by(InvoiceNo) %>%
summarise(n_total = n(), n_items = n_distinct(StockCode))
head(basket_size)
## # A tibble: 6 x 3
## InvoiceNo n_total n_items
## <chr> <int> <int>
## 1 539993 17 17
## 2 540001 9 9
## 3 540002 4 4
## 4 540003 22 22
## 5 540004 1 1
## 6 540005 16 14
# Calculate average values
basket_size %>%
summarize(avg_total_items = mean(n_total), avg_dist_items = mean(n_items))
## # A tibble: 1 x 2
## avg_total_items avg_dist_items
## <dbl> <dbl>
## 1 17.3 16.9
# Distribution of distinct items in baskets
ggplot(basket_size, aes(x=n_items)) +
geom_histogram() + ggtitle("Distribution of basket sizes")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Number of total and distinct items for HERB MARKER THYME
Online_Retail_clean %>%
filter(Description == "HERB MARKER THYME") %>%
summarise(n_tot_items = n(), n_basket_item = n_distinct(InvoiceNo))
## # A tibble: 1 x 2
## n_tot_items n_basket_item
## <int> <int>
## 1 53 52
# Number of baskets containing both items
Online_Retail_clean %>%
filter(Description %in% c("HERB MARKER ROSEMARY", "HERB MARKER THYME")) %>%
group_by(InvoiceNo) %>%
summarise(n = n()) %>%
filter(n==2) %>%
summarise(n_distinct(InvoiceNo))
## # A tibble: 1 x 1
## `n_distinct(InvoiceNo)`
## <int>
## 1 48
Chapter 2 - Metrics and Techniques in Market Basket Analysis
Transactional Data:
Metrics in Market Basket Analysis:
The Apriori Algorithm:
Using Apriori for “if this then that”:
Example code includes:
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
##
## recode
## The following objects are masked from 'package:base':
##
## abbreviate, write
# Splitting transactions
data_list = split(Online_Retail_clean$Description, Online_Retail_clean$InvoiceNo)
# Transform data into a transactional dataset
Online_trx = as(data_list, "transactions")
## Warning in asMethod(object): removing duplicated items in transactions
# Summary of transactions
summary(Online_trx)
## transactions as itemMatrix in sparse format with
## 4057 rows (elements/itemsets/transactions) and
## 2662 columns (items) and a density of 0.006350527
##
## most frequent items:
## WHITE HANGING HEART T-LIGHT HOLDER REGENCY CAKESTAND 3 TIER
## 460 456
## SET OF 3 CAKE TINS PANTRY DESIGN JUMBO BAG RED RETROSPOT
## 432 292
## SET OF 6 SPICE TINS PANTRY DESIGN (Other)
## 279 66665
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 635 271 210 144 153 97 109 120 117 102 119 87 83 88 90 112 83 78 104 71
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 77 73 60 53 33 41 45 45 53 43 35 27 28 35 24 22 16 19 28 17
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 27 22 18 18 13 14 9 18 14 14 12 13 12 10 9 8 9 7 3 12
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 4 3 8 9 4 4 7 6 5 6 6 4 4 4 4 3 2 5 2 2
## 81 82 83 85 86 87 89 92 93 94 95 96 97 98 100 101 104 106 107 108
## 1 4 1 2 2 1 6 1 3 1 1 1 1 4 1 2 1 2 2 2
## 110 111 114 116 118 120 124 130 131 149 151 153 171 227 270
## 2 1 1 1 1 1 1 1 1 2 1 1 1 1 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 3.00 11.00 16.91 23.00 270.00
##
## includes extended item information - examples:
## labels
## 1 10 COLOUR SPACEBOY PEN
## 2 12 COLOURED PARTY BALLOONS
## 3 12 DAISY PEGS IN WOOD BOX
##
## includes extended transaction information - examples:
## transactionID
## 1 539993
## 2 540001
## 3 540002
# inspect first 3 transactions
inspect(head(Online_trx, 3))
## items transactionID
## [1] {BLUE POLKADOT WRAP,
## CAST IRON HOOK GARDEN FORK,
## CHILDRENS APRON APPLES DESIGN,
## COFFEE MUG APPLES DESIGN,
## COFFEE MUG PEARS DESIGN,
## JAM MAKING SET PRINTED,
## JUMBO BAG PINK POLKADOT,
## JUMBO BAG RED RETROSPOT,
## LOVE HEART NAPKIN BOX,
## PEG BAG APPLES DESIGN,
## RECIPE BOX RETROSPOT,
## RECYCLING BAG RETROSPOT,
## RED RETROSPOT CHILDRENS UMBRELLA,
## RED RETROSPOT SHOPPER BAG,
## RED RETROSPOT WRAP,
## SET OF 6 T-LIGHTS EASTER CHICKS,
## WHITE HANGING HEART T-LIGHT HOLDER} 539993
## [2] {CERAMIC BOWL WITH LOVE HEART DESIGN,
## CERAMIC CHERRY CAKE MONEY BANK,
## DOORSTOP RETROSPOT HEART,
## GINGHAM HEART DOORSTOP RED,
## LARGE CAKE STAND HANGING HEARTS,
## LOVE HEART POCKET WARMER,
## PLACE SETTING WHITE HEART,
## RED HANGING HEART T-LIGHT HOLDER,
## SWEETHEART CERAMIC TRINKET BOX} 540001
## [3] {GARDEN METAL SIGN,
## RED KITCHEN SCALES,
## VICTORIAN SEWING BOX SMALL,
## VINTAGE SNAP CARDS} 540002
# inspect last 5 transactions
inspect(tail(Online_trx, 5))
## items transactionID
## [1] {RED RETROSPOT BUTTER DISH,
## RED RETROSPOT TEA CUP AND SAUCER,
## SMALL RED RETROSPOT MUG IN BOX,
## SMALL WHITE RETROSPOT MUG IN BOX,
## STRAWBERRY FAIRY CAKE TEAPOT} C548503
## [2] {ABC TREASURE BOOK BOX} C548508
## [3] {WHITE HANGING HEART T-LIGHT HOLDER} C548513
## [4] {CREAM CUPID HEARTS COAT HANGER,
## RED RETROSPOT CAKE STAND,
## REGENCY CAKESTAND 3 TIER,
## WOODEN FRAME ANTIQUE WHITE,
## WOODEN PICTURE FRAME WHITE FINISH} C548532
## [5] {Manual,
## SILVER HANGING T-LIGHT HOLDER} C548543
# inspect transaction 10
inspect(Online_trx[10])
## items transactionID
## [1] {AGED GLASS SILVER T-LIGHT HOLDER,
## FLUTED ANTIQUE CANDLE HOLDER,
## LOVE HEART NAPKIN BOX,
## MULTI COLOUR SILVER T-LIGHT HOLDER,
## RED RETROSPOT MUG,
## RED RETROSPOT TRADITIONAL TEAPOT,
## RETROSPOT LARGE MILK JUG,
## SET 20 NAPKINS FAIRY CAKES DESIGN,
## SET/20 RED RETROSPOT PAPER NAPKINS,
## SET/5 RED RETROSPOT LID GLASS BOWLS,
## WHITE HANGING HEART T-LIGHT HOLDER} 540016
# Inspect specific transactions
inspect(Online_trx[c(12, 20, 22)])
## items transactionID
## [1] {12 PENCILS TALL TUBE RED RETROSPOT,
## 200 RED + WHITE BENDY STRAWS,
## 60 CAKE CASES DOLLY GIRL DESIGN,
## 72 SWEETHEART FAIRY CAKE CASES,
## BAG 500g SWIRLY MARBLES,
## BROWN CHECK CAT DOORSTOP,
## CALENDAR PAPER CUT DESIGN,
## COFFEE MUG APPLES DESIGN,
## COFFEE MUG PEARS DESIGN,
## CREAM WALL PLANTER HEART SHAPED,
## ENAMEL FLOWER JUG CREAM,
## KEY FOB , BACK DOOR,
## KEY FOB , GARAGE DESIGN,
## KEY FOB , SHED,
## MEMO BOARD RETROSPOT DESIGN,
## PACK OF 12 HEARTS DESIGN TISSUES,
## PACK OF 12 TRADITIONAL CRAYONS,
## PENS ASSORTED FUNNY FACE,
## POTTING SHED TEA MUG,
## RED RETROSPOT ROUND CAKE TINS,
## RETROSPOT LAMP,
## RETROSPOT TEA SET CERAMIC 11 PC,
## ROMANTIC PINKS RIBBONS,
## SET 12 KIDS COLOUR CHALK STICKS,
## SET OF 36 DINOSAUR PAPER DOILIES,
## SILVER HANGING T-LIGHT HOLDER,
## TEA TIME PARTY BUNTING,
## VINTAGE SNAKES & LADDERS,
## WHITE WOOD GARDEN PLANT LADDER} 540019
## [2] {BAKING SET 9 PIECE RETROSPOT,
## BREAD BIN DINER STYLE PINK,
## BREAD BIN DINER STYLE RED,
## CHILDS BREAKFAST SET DOLLY GIRL,
## FRENCH ENAMEL POT W LID,
## FRYING PAN RED RETROSPOT,
## GUMBALL MAGAZINE RACK,
## JARDIN ETCHED GLASS CHEESE DISH,
## RED RETROSPOT TRADITIONAL TEAPOT,
## RETROSPOT TEA SET CERAMIC 11 PC,
## SET OF 16 VINTAGE RED CUTLERY,
## TRIPLE PHOTO FRAME CORNICE,
## VICTORIAN SEWING BOX LARGE} 540028
## [3] {ALPHABET STENCIL CRAFT,
## ASSORTED COLOUR BIRD ORNAMENT,
## DOORMAT AIRMAIL,
## DOORMAT I LOVE LONDON,
## DOORMAT RESPECTABLE HOUSE,
## FANNY'S REST STOPMETAL SIGN,
## FELTCRAFT 6 FLOWER FRIENDS,
## FELTCRAFT BUTTERFLY HEARTS,
## HAPPY STENCIL CRAFT,
## HEART OF WICKER LARGE,
## HEART OF WICKER SMALL,
## HOME BUILDING BLOCK WORD,
## HOT BATHS METAL SIGN,
## I'M ON HOLIDAY METAL SIGN,
## JOY WOODEN BLOCK LETTERS,
## LAUNDRY 15C METAL SIGN,
## LAVENDER SCENTED FABRIC HEART,
## LOVE BUILDING BLOCK WORD,
## MAN FLU METAL SIGN,
## METAL SIGN TAKE IT OR LEAVE IT,
## NO JUNK MAIL METAL SIGN,
## PEACE WOODEN BLOCK LETTERS,
## PLEASE ONE PERSON METAL SIGN,
## ROSE FOLKART HEART DECORATIONS,
## UNION JACK FLAG LUGGAGE TAG,
## UNION JACK FLAG PASSPORT COVER,
## VICTORIAN METAL POSTCARD SPRING,
## WAKE UP COCKEREL CALENDAR SIGN,
## WOOD S/3 CABINET ANT WHITE FINISH,
## YOU'RE CONFUSING ME METAL SIGN} 540031
# Determine the support of both items with support 0.1
support_rosemary_thyme <- apriori(Online_trx, parameter = list(target = "frequent itemsets", supp = 0.1),
appearance = list(items = c("HERB MARKER ROSEMARY", "HERB MARKER THYME"))
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## NA 0.1 1 none FALSE TRUE 5 0.1 1
## maxlen target ext
## 10 frequent itemsets FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 405
##
## set item appearances ...[2 item(s)] done [0.00s].
## set transactions ...[2 item(s), 4057 transaction(s)] done [0.02s].
## sorting and recoding items ... [0 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 done [0.00s].
## writing ... [0 set(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Inspect the itemsets
inspect(support_rosemary_thyme)
# Determine the support of both items with support 0.01
support_rosemary_thyme <- apriori(Online_trx, parameter = list(target = "frequent itemsets", supp = 0.01),
appearance = list(items = c("HERB MARKER ROSEMARY", "HERB MARKER THYME"))
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## NA 0.1 1 none FALSE TRUE 5 0.01 1
## maxlen target ext
## 10 frequent itemsets FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 40
##
## set item appearances ...[2 item(s)] done [0.00s].
## set transactions ...[2 item(s), 4057 transaction(s)] done [0.05s].
## sorting and recoding items ... [2 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [3 set(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Inspect the itemsets
inspect(support_rosemary_thyme)
## items support count
## [1] {HERB MARKER ROSEMARY} 0.01257087 51
## [2] {HERB MARKER THYME} 0.01281735 52
## [3] {HERB MARKER ROSEMARY,HERB MARKER THYME} 0.01207789 49
# Frequent itemsets for all items
support_all <- apriori(Online_trx, parameter = list(target="frequent itemsets", supp = 0.01))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## NA 0.1 1 none FALSE TRUE 5 0.01 1
## maxlen target ext
## 10 frequent itemsets FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 40
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[2662 item(s), 4057 transaction(s)] done [0.04s].
## sorting and recoding items ... [524 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.01s].
## writing ... [854 set(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Inspect the 5 most frequent items
inspect(head(sort(support_all, by="support"), 5))
## items support count
## [1] {WHITE HANGING HEART T-LIGHT HOLDER} 0.11338427 460
## [2] {REGENCY CAKESTAND 3 TIER} 0.11239832 456
## [3] {SET OF 3 CAKE TINS PANTRY DESIGN} 0.10648262 432
## [4] {JUMBO BAG RED RETROSPOT} 0.07197437 292
## [5] {SET OF 6 SPICE TINS PANTRY DESIGN} 0.06877003 279
# Call the apriori function with apropriate parameters
rules_all <- apriori(Online_trx, parameter = list(supp=0.01, conf = 0.4))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.4 0.1 1 none FALSE TRUE 5 0.01 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 40
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[2662 item(s), 4057 transaction(s)] done [0.04s].
## sorting and recoding items ... [524 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.01s].
## writing ... [384 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Call the apriori function with apropriate parameters
rules_all <- apriori(Online_trx, parameter = list(supp=0.01, conf = 0.4, minlen=2))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.4 0.1 1 none FALSE TRUE 5 0.01 2
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 40
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[2662 item(s), 4057 transaction(s)] done [0.05s].
## sorting and recoding items ... [524 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.01s].
## writing ... [384 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Inspect the rules with highest confidence
inspect(head(sort(rules_all, by="confidence"), 5))
## lhs rhs support confidence lift count
## [1] {HERB MARKER PARSLEY,
## HERB MARKER ROSEMARY} => {HERB MARKER THYME} 0.01059896 0.9772727 76.24607 43
## [2] {HERB MARKER ROSEMARY} => {HERB MARKER THYME} 0.01207789 0.9607843 74.95965 49
## [3] {HERB MARKER MINT,
## HERB MARKER ROSEMARY} => {HERB MARKER THYME} 0.01109194 0.9574468 74.69926 45
## [4] {HERB MARKER MINT,
## HERB MARKER THYME} => {HERB MARKER ROSEMARY} 0.01109194 0.9574468 76.16395 45
## [5] {HERB MARKER PARSLEY,
## HERB MARKER THYME} => {HERB MARKER ROSEMARY} 0.01059896 0.9555556 76.01351 43
# Inspect the rules with highest lift
inspect(head(sort(rules_all, by="lift"), 5))
## lhs rhs support confidence lift count
## [1] {HERB MARKER PARSLEY,
## HERB MARKER ROSEMARY} => {HERB MARKER THYME} 0.01059896 0.9772727 76.24607 43
## [2] {HERB MARKER MINT,
## HERB MARKER THYME} => {HERB MARKER ROSEMARY} 0.01109194 0.9574468 76.16395 45
## [3] {HERB MARKER PARSLEY,
## HERB MARKER THYME} => {HERB MARKER ROSEMARY} 0.01059896 0.9555556 76.01351 43
## [4] {HERB MARKER ROSEMARY} => {HERB MARKER THYME} 0.01207789 0.9607843 74.95965 49
## [5] {HERB MARKER THYME} => {HERB MARKER ROSEMARY} 0.01207789 0.9423077 74.95965 49
# Find the confidence and lift measures
rules_rosemary_rhs <- apriori(Online_trx, parameter = list(supp=0.01, conf=0.5, minlen=2),
appearance = list(rhs="HERB MARKER ROSEMARY", default = "lhs")
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.01 2
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 40
##
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[2662 item(s), 4057 transaction(s)] done [0.06s].
## sorting and recoding items ... [524 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [7 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Inspect the rules
inspect(rules_rosemary_rhs)
## lhs rhs support confidence lift count
## [1] {HERB MARKER BASIL} => {HERB MARKER ROSEMARY} 0.01035248 0.8936170 71.08636 42
## [2] {HERB MARKER PARSLEY} => {HERB MARKER ROSEMARY} 0.01084545 0.8979592 71.43177 44
## [3] {HERB MARKER THYME} => {HERB MARKER ROSEMARY} 0.01207789 0.9423077 74.95965 49
## [4] {HERB MARKER MINT} => {HERB MARKER ROSEMARY} 0.01158491 0.8867925 70.54347 47
## [5] {HERB MARKER PARSLEY,
## HERB MARKER THYME} => {HERB MARKER ROSEMARY} 0.01059896 0.9555556 76.01351 43
## [6] {HERB MARKER MINT,
## HERB MARKER PARSLEY} => {HERB MARKER ROSEMARY} 0.01010599 0.9318182 74.12522 41
## [7] {HERB MARKER MINT,
## HERB MARKER THYME} => {HERB MARKER ROSEMARY} 0.01109194 0.9574468 76.16395 45
# Find the confidence and lift measures
rules_rosemary_lhs <- apriori(Online_trx, parameter = list(supp=0.01, conf=0.5, minlen=2),
appearance = list(lhs="HERB MARKER ROSEMARY", default = "rhs")
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.01 2
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 40
##
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[2662 item(s), 4057 transaction(s)] done [0.04s].
## sorting and recoding items ... [524 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [4 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Inspect the rules
inspect(rules_rosemary_lhs)
## lhs rhs support confidence
## [1] {HERB MARKER ROSEMARY} => {HERB MARKER BASIL} 0.01035248 0.8235294
## [2] {HERB MARKER ROSEMARY} => {HERB MARKER PARSLEY} 0.01084545 0.8627451
## [3] {HERB MARKER ROSEMARY} => {HERB MARKER THYME} 0.01207789 0.9607843
## [4] {HERB MARKER ROSEMARY} => {HERB MARKER MINT} 0.01158491 0.9215686
## lift count
## [1] 71.08636 42
## [2] 71.43177 44
## [3] 74.95965 49
## [4] 70.54347 47
# Create the union of the rules and inspect
rules_rosemary <- arules::union(rules_rosemary_rhs, rules_rosemary_lhs)
inspect(rules_rosemary)
## lhs rhs support confidence lift count
## [1] {HERB MARKER BASIL} => {HERB MARKER ROSEMARY} 0.01035248 0.8936170 71.08636 42
## [2] {HERB MARKER PARSLEY} => {HERB MARKER ROSEMARY} 0.01084545 0.8979592 71.43177 44
## [3] {HERB MARKER THYME} => {HERB MARKER ROSEMARY} 0.01207789 0.9423077 74.95965 49
## [4] {HERB MARKER MINT} => {HERB MARKER ROSEMARY} 0.01158491 0.8867925 70.54347 47
## [5] {HERB MARKER PARSLEY,
## HERB MARKER THYME} => {HERB MARKER ROSEMARY} 0.01059896 0.9555556 76.01351 43
## [6] {HERB MARKER MINT,
## HERB MARKER PARSLEY} => {HERB MARKER ROSEMARY} 0.01010599 0.9318182 74.12522 41
## [7] {HERB MARKER MINT,
## HERB MARKER THYME} => {HERB MARKER ROSEMARY} 0.01109194 0.9574468 76.16395 45
## [8] {HERB MARKER ROSEMARY} => {HERB MARKER BASIL} 0.01035248 0.8235294 71.08636 42
## [9] {HERB MARKER ROSEMARY} => {HERB MARKER PARSLEY} 0.01084545 0.8627451 71.43177 44
## [10] {HERB MARKER ROSEMARY} => {HERB MARKER THYME} 0.01207789 0.9607843 74.95965 49
## [11] {HERB MARKER ROSEMARY} => {HERB MARKER MINT} 0.01158491 0.9215686 70.54347 47
# Apply the apriori function to the Online retail dataset
rules_online <- apriori(Online_trx, parameter = list(supp = 0.01, conf = 0.8, minlen = 2))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.01 2
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 40
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[2662 item(s), 4057 transaction(s)] done [0.04s].
## sorting and recoding items ... [524 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [52 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Inspect the first 5 rules
inspect(head(rules_online, 5))
## lhs rhs support confidence
## [1] {HERB MARKER BASIL} => {HERB MARKER ROSEMARY} 0.01035248 0.8936170
## [2] {HERB MARKER ROSEMARY} => {HERB MARKER BASIL} 0.01035248 0.8235294
## [3] {HERB MARKER BASIL} => {HERB MARKER THYME} 0.01059896 0.9148936
## [4] {HERB MARKER THYME} => {HERB MARKER BASIL} 0.01059896 0.8269231
## [5] {HERB MARKER BASIL} => {HERB MARKER MINT} 0.01059896 0.9148936
## lift count
## [1] 71.08636 42
## [2] 71.08636 42
## [3] 71.37930 43
## [4] 71.37930 43
## [5] 70.03252 43
# Inspect the first 5 rules with highest lift
inspect(head(sort(rules_online, by="lift"), 5))
## lhs rhs support confidence lift count
## [1] {HERB MARKER PARSLEY,
## HERB MARKER ROSEMARY} => {HERB MARKER THYME} 0.01059896 0.9772727 76.24607 43
## [2] {HERB MARKER MINT,
## HERB MARKER THYME} => {HERB MARKER ROSEMARY} 0.01109194 0.9574468 76.16395 45
## [3] {HERB MARKER PARSLEY,
## HERB MARKER THYME} => {HERB MARKER ROSEMARY} 0.01059896 0.9555556 76.01351 43
## [4] {HERB MARKER ROSEMARY} => {HERB MARKER THYME} 0.01207789 0.9607843 74.95965 49
## [5] {HERB MARKER THYME} => {HERB MARKER ROSEMARY} 0.01207789 0.9423077 74.95965 49
# Transform the rules back to a dataframe
rules_online_df <- as(rules_online, "data.frame")
# Check the first records
head(rules_online_df)
## rules support confidence lift
## 1 {HERB MARKER BASIL} => {HERB MARKER ROSEMARY} 0.01035248 0.8936170 71.08636
## 2 {HERB MARKER ROSEMARY} => {HERB MARKER BASIL} 0.01035248 0.8235294 71.08636
## 3 {HERB MARKER BASIL} => {HERB MARKER THYME} 0.01059896 0.9148936 71.37930
## 4 {HERB MARKER THYME} => {HERB MARKER BASIL} 0.01059896 0.8269231 71.37930
## 5 {HERB MARKER BASIL} => {HERB MARKER MINT} 0.01059896 0.9148936 70.03252
## 6 {HERB MARKER MINT} => {HERB MARKER BASIL} 0.01059896 0.8113208 70.03252
## count
## 1 42
## 2 42
## 3 43
## 4 43
## 5 43
## 6 43
# Apply the apriori function to the Online retail dataset
rules_online <- apriori(Online_trx, parameter = list(supp = 0.01, conf = 0.8, minlen = 2))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.01 2
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 40
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[2662 item(s), 4057 transaction(s)] done [0.04s].
## sorting and recoding items ... [524 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [52 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Inspect the first rules
inspect(head(rules_online))
## lhs rhs support confidence
## [1] {HERB MARKER BASIL} => {HERB MARKER ROSEMARY} 0.01035248 0.8936170
## [2] {HERB MARKER ROSEMARY} => {HERB MARKER BASIL} 0.01035248 0.8235294
## [3] {HERB MARKER BASIL} => {HERB MARKER THYME} 0.01059896 0.9148936
## [4] {HERB MARKER THYME} => {HERB MARKER BASIL} 0.01059896 0.8269231
## [5] {HERB MARKER BASIL} => {HERB MARKER MINT} 0.01059896 0.9148936
## [6] {HERB MARKER MINT} => {HERB MARKER BASIL} 0.01059896 0.8113208
## lift count
## [1] 71.08636 42
## [2] 71.08636 42
## [3] 71.37930 43
## [4] 71.37930 43
## [5] 70.03252 43
## [6] 70.03252 43
# Support of herb markers
supp_herb_markers <- apriori(Online_trx, parameter = list(target = "frequent itemsets", supp = 0.01),
appearance = list(items = c("HERB MARKER THYME", "HERB MARKER ROSEMARY"))
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## NA 0.1 1 none FALSE TRUE 5 0.01 1
## maxlen target ext
## 10 frequent itemsets FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 40
##
## set item appearances ...[2 item(s)] done [0.00s].
## set transactions ...[2 item(s), 4057 transaction(s)] done [0.03s].
## sorting and recoding items ... [2 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [3 set(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Inspect frequent itemsets
inspect(supp_herb_markers)
## items support count
## [1] {HERB MARKER ROSEMARY} 0.01257087 51
## [2] {HERB MARKER THYME} 0.01281735 52
## [3] {HERB MARKER ROSEMARY,HERB MARKER THYME} 0.01207789 49
# Extract rules for HERB MARKER THYME on rhs of rule
rules_thyme_marker_rhs <- apriori(Online_trx, parameter = list(supp=0.01, conf=0.8, minlen=2),
appearance = list(rhs = "HERB MARKER THYME"), control = list(verbose=F)
)
# Inspect rules
inspect(rules_thyme_marker_rhs)
## lhs rhs support confidence lift count
## [1] {HERB MARKER BASIL} => {HERB MARKER THYME} 0.01059896 0.9148936 71.37930 43
## [2] {HERB MARKER PARSLEY} => {HERB MARKER THYME} 0.01109194 0.9183673 71.65031 45
## [3] {HERB MARKER ROSEMARY} => {HERB MARKER THYME} 0.01207789 0.9607843 74.95965 49
## [4] {HERB MARKER MINT} => {HERB MARKER THYME} 0.01158491 0.8867925 69.18687 47
## [5] {HERB MARKER PARSLEY,
## HERB MARKER ROSEMARY} => {HERB MARKER THYME} 0.01059896 0.9772727 76.24607 43
## [6] {HERB MARKER MINT,
## HERB MARKER PARSLEY} => {HERB MARKER THYME} 0.01010599 0.9318182 72.69974 41
## [7] {HERB MARKER MINT,
## HERB MARKER ROSEMARY} => {HERB MARKER THYME} 0.01109194 0.9574468 74.69926 45
# Extract rules for HERB MARKER THYME on lhs of rule
rules_thyme_marker_lhs <- apriori(Online_trx, parameter = list(supp=0.01, conf=0.8, minlen=2),
appearance = list(lhs = "HERB MARKER THYME"), control = list (verbose=F)
)
# Inspect rules
inspect(rules_thyme_marker_lhs)
## lhs rhs support confidence
## [1] {HERB MARKER THYME} => {HERB MARKER BASIL} 0.01059896 0.8269231
## [2] {HERB MARKER THYME} => {HERB MARKER PARSLEY} 0.01109194 0.8653846
## [3] {HERB MARKER THYME} => {HERB MARKER ROSEMARY} 0.01207789 0.9423077
## [4] {HERB MARKER THYME} => {HERB MARKER MINT} 0.01158491 0.9038462
## lift count
## [1] 71.37930 43
## [2] 71.65031 45
## [3] 74.95965 49
## [4] 69.18687 47
# Apply the apriori function to the Online retail dataset
rules <- apriori(Online_trx, parameter = list(supp = 0.01, conf = 0.8, minlen = 2))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.01 2
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 40
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[2662 item(s), 4057 transaction(s)] done [0.04s].
## sorting and recoding items ... [524 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [52 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Inspect the first 5 rules
inspect(head(rules))
## lhs rhs support confidence
## [1] {HERB MARKER BASIL} => {HERB MARKER ROSEMARY} 0.01035248 0.8936170
## [2] {HERB MARKER ROSEMARY} => {HERB MARKER BASIL} 0.01035248 0.8235294
## [3] {HERB MARKER BASIL} => {HERB MARKER THYME} 0.01059896 0.9148936
## [4] {HERB MARKER THYME} => {HERB MARKER BASIL} 0.01059896 0.8269231
## [5] {HERB MARKER BASIL} => {HERB MARKER MINT} 0.01059896 0.9148936
## [6] {HERB MARKER MINT} => {HERB MARKER BASIL} 0.01059896 0.8113208
## lift count
## [1] 71.08636 42
## [2] 71.08636 42
## [3] 71.37930 43
## [4] 71.37930 43
## [5] 70.03252 43
## [6] 70.03252 43
# Find out redundant of rules
redundant_rules <- is.redundant(rules)
# Inspect the non redundant rules
non_redundant_rules <- rules[!redundant_rules]
inspect(head(non_redundant_rules))
## lhs rhs support confidence
## [1] {HERB MARKER BASIL} => {HERB MARKER ROSEMARY} 0.01035248 0.8936170
## [2] {HERB MARKER ROSEMARY} => {HERB MARKER BASIL} 0.01035248 0.8235294
## [3] {HERB MARKER BASIL} => {HERB MARKER THYME} 0.01059896 0.9148936
## [4] {HERB MARKER THYME} => {HERB MARKER BASIL} 0.01059896 0.8269231
## [5] {HERB MARKER BASIL} => {HERB MARKER MINT} 0.01059896 0.9148936
## [6] {HERB MARKER MINT} => {HERB MARKER BASIL} 0.01059896 0.8113208
## lift count
## [1] 71.08636 42
## [2] 71.08636 42
## [3] 71.37930 43
## [4] 71.37930 43
## [5] 70.03252 43
## [6] 70.03252 43
Chapter 3 - Visualization in Market Basket Analysis
Items in the Basket:
Visualizing Metrics:
Rules to Graph-Based Visualizations:
Alternative Rule Plots:
Example code includes:
# Display items horizontally
itemFrequencyPlot(Online_trx, topN = 5, horiz = TRUE)
# Changing the font of the items
itemFrequencyPlot(Online_trx, topN = 10, col = rainbow(10), type = "relative", horiz = TRUE,
main = "Relative Item Frequency Plot" ,xlab = "Frequency", cex.names = 0.8
)
library(arulesViz)
## Loading required package: grid
## Registered S3 method overwritten by 'seriation':
## method from
## reorder.hclust gclus
# Inspection of the rules
inspectDT(rules_online)
# Create a standard scatterplot
plot(rules_online)
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
# Change the axis and legend of the scatterplot
plot(rules_online, measure = c("confidence", "lift"), shading = "support")
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
# Plot a two-key plot
plot(rules_online, method = "two-key plot")
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
# Plot a matrix plot
plot(rules_online, method = "matrix")
## Itemsets in Antecedent (LHS)
## [1] "{HERB MARKER MINT,HERB MARKER THYME}"
## [2] "{HERB MARKER PARSLEY,HERB MARKER ROSEMARY}"
## [3] "{HERB MARKER MINT,HERB MARKER ROSEMARY}"
## [4] "{HERB MARKER MINT,HERB MARKER PARSLEY}"
## [5] "{HERB MARKER PARSLEY,HERB MARKER THYME}"
## [6] "{HERB MARKER ROSEMARY}"
## [7] "{HERB MARKER THYME}"
## [8] "{HERB MARKER ROSEMARY,HERB MARKER THYME}"
## [9] "{HERB MARKER BASIL}"
## [10] "{HERB MARKER PARSLEY}"
## [11] "{HERB MARKER MINT}"
## [12] "{CHILDS GARDEN TROWEL PINK}"
## [13] "{CHILDS GARDEN TROWEL BLUE}"
## [14] "{SET/10 BLUE POLKADOT PARTY CANDLES}"
## [15] "{POPPY'S PLAYHOUSE LIVINGROOM}"
## [16] "{COFFEE MUG PEARS DESIGN}"
## [17] "{SET/20 RED RETROSPOT PAPER NAPKINS,SET/6 RED SPOTTY PAPER PLATES}"
## [18] "{SET/20 RED RETROSPOT PAPER NAPKINS,SET/6 RED SPOTTY PAPER CUPS}"
## [19] "{SET/6 RED SPOTTY PAPER CUPS}"
## [20] "{BLUE FELT EASTER EGG BASKET}"
## [21] "{KITCHEN METAL SIGN}"
## [22] "{PINK REGENCY TEACUP AND SAUCER,REGENCY CAKESTAND 3 TIER,ROSES REGENCY TEACUP AND SAUCER}"
## [23] "{PINK REGENCY TEACUP AND SAUCER,ROSES REGENCY TEACUP AND SAUCER}"
## [24] "{PINK REGENCY TEACUP AND SAUCER,REGENCY CAKESTAND 3 TIER}"
## [25] "{GREEN REGENCY TEACUP AND SAUCER,PINK REGENCY TEACUP AND SAUCER,REGENCY CAKESTAND 3 TIER}"
## [26] "{REGENCY CAKESTAND 3 TIER,ROSES REGENCY TEACUP AND SAUCER}"
## [27] "{GREEN REGENCY TEACUP AND SAUCER,PINK REGENCY TEACUP AND SAUCER}"
## [28] "{GREEN REGENCY TEACUP AND SAUCER,REGENCY CAKESTAND 3 TIER}"
## [29] "{SET OF 3 CAKE TINS PANTRY DESIGN,SET OF 6 HERB TINS SKETCHBOOK}"
## [30] "{JUMBO BAG BAROQUE BLACK WHITE,JUMBO STORAGE BAG SUKI}"
## [31] "{CANDLEHOLDER PINK HANGING HEART,RED HANGING HEART T-LIGHT HOLDER}"
## Itemsets in Consequent (RHS)
## [1] "{WHITE HANGING HEART T-LIGHT HOLDER}"
## [2] "{JUMBO BAG RED RETROSPOT}"
## [3] "{SET OF 6 SPICE TINS PANTRY DESIGN}"
## [4] "{ROSES REGENCY TEACUP AND SAUCER}"
## [5] "{GREEN REGENCY TEACUP AND SAUCER}"
## [6] "{BATHROOM METAL SIGN}"
## [7] "{CREAM FELT EASTER EGG BASKET}"
## [8] "{SET/6 RED SPOTTY PAPER PLATES}"
## [9] "{POPPY'S PLAYHOUSE KITCHEN}"
## [10] "{SET/6 RED SPOTTY PAPER CUPS}"
## [11] "{COFFEE MUG APPLES DESIGN}"
## [12] "{SET/10 PINK POLKADOT PARTY CANDLES}"
## [13] "{POPPY'S PLAYHOUSE BEDROOM}"
## [14] "{CHILDS GARDEN TROWEL PINK}"
## [15] "{CHILDS GARDEN TROWEL BLUE}"
## [16] "{HERB MARKER MINT}"
## [17] "{HERB MARKER BASIL}"
## [18] "{HERB MARKER PARSLEY}"
## [19] "{HERB MARKER THYME}"
## [20] "{HERB MARKER ROSEMARY}"
# Plot a matrix plot with confidence as color coding
plot(rules_online, method = "matrix", shading = "confidence")
## Itemsets in Antecedent (LHS)
## [1] "{HERB MARKER PARSLEY,HERB MARKER ROSEMARY}"
## [2] "{PINK REGENCY TEACUP AND SAUCER,REGENCY CAKESTAND 3 TIER,ROSES REGENCY TEACUP AND SAUCER}"
## [3] "{HERB MARKER PARSLEY,HERB MARKER THYME}"
## [4] "{HERB MARKER MINT,HERB MARKER PARSLEY}"
## [5] "{PINK REGENCY TEACUP AND SAUCER,ROSES REGENCY TEACUP AND SAUCER}"
## [6] "{SET/20 RED RETROSPOT PAPER NAPKINS,SET/6 RED SPOTTY PAPER CUPS}"
## [7] "{HERB MARKER MINT,HERB MARKER ROSEMARY}"
## [8] "{HERB MARKER MINT,HERB MARKER THYME}"
## [9] "{HERB MARKER BASIL}"
## [10] "{HERB MARKER PARSLEY}"
## [11] "{HERB MARKER ROSEMARY,HERB MARKER THYME}"
## [12] "{HERB MARKER ROSEMARY}"
## [13] "{SET/6 RED SPOTTY PAPER CUPS}"
## [14] "{HERB MARKER THYME}"
## [15] "{GREEN REGENCY TEACUP AND SAUCER,PINK REGENCY TEACUP AND SAUCER,REGENCY CAKESTAND 3 TIER}"
## [16] "{CANDLEHOLDER PINK HANGING HEART,RED HANGING HEART T-LIGHT HOLDER}"
## [17] "{PINK REGENCY TEACUP AND SAUCER,REGENCY CAKESTAND 3 TIER}"
## [18] "{BLUE FELT EASTER EGG BASKET}"
## [19] "{SET/10 BLUE POLKADOT PARTY CANDLES}"
## [20] "{HERB MARKER MINT}"
## [21] "{GREEN REGENCY TEACUP AND SAUCER,PINK REGENCY TEACUP AND SAUCER}"
## [22] "{SET/20 RED RETROSPOT PAPER NAPKINS,SET/6 RED SPOTTY PAPER PLATES}"
## [23] "{CHILDS GARDEN TROWEL BLUE}"
## [24] "{REGENCY CAKESTAND 3 TIER,ROSES REGENCY TEACUP AND SAUCER}"
## [25] "{POPPY'S PLAYHOUSE LIVINGROOM}"
## [26] "{KITCHEN METAL SIGN}"
## [27] "{CHILDS GARDEN TROWEL PINK}"
## [28] "{GREEN REGENCY TEACUP AND SAUCER,REGENCY CAKESTAND 3 TIER}"
## [29] "{SET OF 3 CAKE TINS PANTRY DESIGN,SET OF 6 HERB TINS SKETCHBOOK}"
## [30] "{COFFEE MUG PEARS DESIGN}"
## [31] "{JUMBO BAG BAROQUE BLACK WHITE,JUMBO STORAGE BAG SUKI}"
## Itemsets in Consequent (RHS)
## [1] "{POPPY'S PLAYHOUSE KITCHEN}"
## [2] "{JUMBO BAG RED RETROSPOT}"
## [3] "{COFFEE MUG APPLES DESIGN}"
## [4] "{SET OF 6 SPICE TINS PANTRY DESIGN}"
## [5] "{CHILDS GARDEN TROWEL BLUE}"
## [6] "{BATHROOM METAL SIGN}"
## [7] "{HERB MARKER BASIL}"
## [8] "{CHILDS GARDEN TROWEL PINK}"
## [9] "{SET/6 RED SPOTTY PAPER CUPS}"
## [10] "{POPPY'S PLAYHOUSE BEDROOM}"
## [11] "{ROSES REGENCY TEACUP AND SAUCER}"
## [12] "{SET/10 PINK POLKADOT PARTY CANDLES}"
## [13] "{HERB MARKER PARSLEY}"
## [14] "{CREAM FELT EASTER EGG BASKET}"
## [15] "{WHITE HANGING HEART T-LIGHT HOLDER}"
## [16] "{GREEN REGENCY TEACUP AND SAUCER}"
## [17] "{SET/6 RED SPOTTY PAPER PLATES}"
## [18] "{HERB MARKER MINT}"
## [19] "{HERB MARKER ROSEMARY}"
## [20] "{HERB MARKER THYME}"
# Create a HTML widget of the graph of rules
plot(rules_online, method = "graph", engine = "htmlwidget")
# HTML widget graph for the highest confidence rules
plot(head(sort(rules_online, by="confidence"), 5), method = "graph", engine = "htmlwidget")
# HTML widget graph for rules with lowest lift
plot(tail(sort(rules_online, by="lift"), 5), method = "graph", engine = "htmlwidget")
# Create an interactive graph visualization
rules_html <- plot(rules_online, method = "graph", engine = "htmlwidget")
# Save the interactive graph as an html file
# htmlwidgets::saveWidget(rules_html, file = "./RInputFiles/rules_grocery.html")
# Plot a group matrix-based visualization
# plot(subset_rules, method = "grouped")
# Change the arguments of group matrix-based visualization
# plot(subset_rules, method = "grouped", measure = "lift", shading = "confidence")
# Plotting the parallel coordinate plots
plot(rules_online, method = "paracoord")
# Parallel coordinate plots with confidence as color coding
plot(rules_online, method = "paracoord", shading = "confidence")
Chapter 4 - Case Study: Market Basket with Movies
Recap on Transactions:
Mining Association Rules:
Visualizing Transactions and Rules:
Making the most of Market Basket Analysis:
Wrap Up:
Example code includes:
# Have a glimpse at the dataset
movie_subset %>%
glimpse()
## Observations: 19,455
## Variables: 6
## $ X1 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, ...
## $ userId <dbl> 1323, 1323, 1323, 1323, 1323, 1323, 1323, 1323, 1323, 1323,...
## $ movieId <dbl> 1, 3, 5, 10, 11, 12, 15, 16, 17, 19, 21, 22, 23, 29, 31, 34...
## $ title <chr> "Toy Story", "Grumpier Old Men", "Father of the Bride Part ...
## $ year <dbl> 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995,...
## $ genres <chr> "Adventure|Animation|Children|Comedy|Fantasy", "Comedy|Roma...
# Calculate the number of distinct users and movies
n_distinct(movie_subset$userId)
## [1] 100
n_distinct(movie_subset$movieId)
## [1] 4598
# Distribution of the number of movies watched by users
movie_subset %>%
group_by(userId) %>%
summarize(nb_movies = n_distinct(movieId)) %>%
ggplot(aes(x=nb_movies)) +
geom_histogram() +
ggtitle("Distribution of number of movies watched")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Split dataset into movies and users
data_list <- split(movie_subset$title, movie_subset$userId)
# Transform data into a transactional dataset
movie_trx <- as(data_list, "transactions")
## Warning in asMethod(object): removing duplicated items in transactions
# Plot of the item matrix
image(movie_trx[1:100,1:100])
# Setting the plot configuration option
par(mfrow=c(2, 1))
# Plot the relative and absolute item frequency plot
itemFrequencyPlot(movie_trx, type = "relative", topN = 10, horiz = TRUE, main = 'Relative item frequency')
itemFrequencyPlot(movie_trx, type = "absolute", topN = 10, horiz = TRUE, main = 'Absolute item frequency')
par(mfrow=c(1, 1))
# Extract the set of most frequent itemsets
itemsets <- apriori(movie_trx, parameter = list(support = 0.4, target = 'frequent itemsets'))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## NA 0.1 1 none FALSE TRUE 5 0.4 1
## maxlen target ext
## 10 frequent itemsets FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 40
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [15 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [16 set(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Inspect the five most popular items
arules::inspect(sort(itemsets, by='support', decreasing = TRUE)[1:5])
## items support count
## [1] {Matrix, The} 0.60 60
## [2] {American Beauty} 0.57 57
## [3] {Fight Club} 0.54 54
## [4] {Silence of the Lambs, The} 0.50 50
## [5] {Shawshank Redemption, The} 0.48 48
# Extract the set of most frequent itemsets
itemsets_minlen2 <- apriori(movie_trx, parameter = list(support = 0.3, minlen = 2, target = 'frequent'))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## NA 0.1 1 none FALSE TRUE 5 0.3 2
## maxlen target ext
## 10 frequent itemsets FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 30
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [56 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [115 set(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Inspect the five most popular items
arules::inspect(sort(itemsets_minlen2, by='support', decreasing = TRUE)[1:5])
## items support count
## [1] {Matrix, The,
## Silence of the Lambs, The} 0.40 40
## [2] {Lord of the Rings: The Fellowship of the Ring, The,
## Lord of the Rings: The Two Towers, The} 0.38 38
## [3] {American Beauty,
## Pulp Fiction} 0.38 38
## [4] {Pulp Fiction,
## Silence of the Lambs, The} 0.38 38
## [5] {Matrix, The,
## Star Wars: Episode IV - A New Hope} 0.38 38
# Set of confidence levels
confidenceLevels <- seq(from=0.95, to=0.5, by=-0.05)
# Create empty vector
rules_sup04 <- NULL
rules_sup03 <- NULL
# Apriori algorithm with a support level of 40% and 30%
for (i in 1:length(confidenceLevels)) {
rules_sup04[i] = length(apriori(movie_trx,
parameter=list(sup=0.4, conf=confidenceLevels[i], target="rules")
)
)
rules_sup03[i] = length(apriori(movie_trx,
parameter=list(sup=0.3, conf=confidenceLevels[i], target="rules")
)
)
}
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.95 0.1 1 none FALSE TRUE 5 0.4 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 40
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [15 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [0 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.95 0.1 1 none FALSE TRUE 5 0.3 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 30
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [56 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [10 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.9 0.1 1 none FALSE TRUE 5 0.4 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 40
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [15 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [0 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.9 0.1 1 none FALSE TRUE 5 0.3 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 30
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [56 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [26 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.85 0.1 1 none FALSE TRUE 5 0.4 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 40
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.02s].
## sorting and recoding items ... [15 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [0 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.85 0.1 1 none FALSE TRUE 5 0.3 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 30
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [56 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [52 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.4 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 40
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.02s].
## sorting and recoding items ... [15 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [1 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.3 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 30
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [56 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [90 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.75 0.1 1 none FALSE TRUE 5 0.4 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 40
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [15 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [1 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.75 0.1 1 none FALSE TRUE 5 0.3 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 30
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.02s].
## sorting and recoding items ... [56 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [129 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.7 0.1 1 none FALSE TRUE 5 0.4 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 40
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [15 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [1 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.7 0.1 1 none FALSE TRUE 5 0.3 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 30
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [56 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [162 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.65 0.1 1 none FALSE TRUE 5 0.4 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 40
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.02s].
## sorting and recoding items ... [15 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [2 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.65 0.1 1 none FALSE TRUE 5 0.3 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 30
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [56 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [194 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.6 0.1 1 none FALSE TRUE 5 0.4 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 40
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [15 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [3 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.6 0.1 1 none FALSE TRUE 5 0.3 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 30
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.05s].
## sorting and recoding items ... [56 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [220 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.55 0.1 1 none FALSE TRUE 5 0.4 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 40
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.04s].
## sorting and recoding items ... [15 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [4 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.55 0.1 1 none FALSE TRUE 5 0.3 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 30
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.04s].
## sorting and recoding items ... [56 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [238 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.4 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 40
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.03s].
## sorting and recoding items ... [15 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [6 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.3 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 30
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [56 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [254 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Number of rules found with a support level of 40%
qplot(confidenceLevels, rules_sup04, geom=c("point", "line"), xlab="Confidence level",
ylab="Number of rules found",main="Apriori with a support level of 40%"
) +
theme_bw()
# Create Data frame containing all results
nb_rules <- data.frame(rules_sup04, rules_sup03, confidenceLevels)
# Number of rules found with a support level of 40% and 30%
ggplot(data=nb_rules, aes(x=confidenceLevels)) +
# Lines and points for rules_sup04
geom_line(aes(y=rules_sup04, colour="Support level of 40%")) +
geom_point(aes(y=rules_sup04, colour="Support level of 40%")) +
# Lines and points for rules_sup03
geom_line(aes(y=rules_sup03, colour="Support level of 30%")) +
geom_point(aes(y=rules_sup03, colour="Support level of 30%")) +
# Polishing the graph
theme_bw() + ylab("") +
ggtitle("Number of extracted rules with apriori")
# Extract rules with the apriori
rules_movies <- apriori(movie_trx, parameter = list(supp = 0.3, conf = 0.9, minlen = 2, target = "rules"))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.9 0.1 1 none FALSE TRUE 5 0.3 2
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 30
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [56 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [26 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Summary of extracted rules
summary(rules_movies)
## set of 26 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3 4
## 8 15 3
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 2.000 3.000 2.808 3.000 4.000
##
## summary of quality measures:
## support confidence lift count
## Min. :0.3000 Min. :0.9091 Min. :1.515 Min. :30.00
## 1st Qu.:0.3100 1st Qu.:0.9216 1st Qu.:1.568 1st Qu.:31.00
## Median :0.3400 Median :0.9394 Median :2.191 Median :34.00
## Mean :0.3315 Mean :0.9536 Mean :2.095 Mean :33.15
## 3rd Qu.:0.3500 3rd Qu.:1.0000 3rd Qu.:2.558 3rd Qu.:35.00
## Max. :0.3800 Max. :1.0000 Max. :2.632 Max. :38.00
##
## mining info:
## data ntransactions support confidence
## movie_trx 100 0.3 0.9
# Create redudant rules and filter from extracted rules
rules_red <- is.redundant(rules_movies)
rules.pruned <- rules_movies[!rules_red]
# Inspect the non-redundant rules with highest confidence
arules::inspect(head(sort(rules.pruned, by="confidence")))
## lhs rhs support confidence lift count
## [1] {Lord of the Rings: The Two Towers, The} => {Lord of the Rings: The Fellowship of the Ring, The} 0.38 1.0000000 2.222222 38
## [2] {Lord of the Rings: The Fellowship of the Ring, The,
## Lord of the Rings: The Return of the King, The} => {Lord of the Rings: The Two Towers, The} 0.35 1.0000000 2.631579 35
## [3] {Lord of the Rings: The Return of the King, The,
## Matrix, The} => {Lord of the Rings: The Two Towers, The} 0.31 1.0000000 2.631579 31
## [4] {Lord of the Rings: The Return of the King, The,
## Matrix, The} => {Lord of the Rings: The Fellowship of the Ring, The} 0.31 1.0000000 2.222222 31
## [5] {Lord of the Rings: The Return of the King, The} => {Lord of the Rings: The Two Towers, The} 0.35 0.9722222 2.558480 35
## [6] {Lord of the Rings: The Return of the King, The} => {Lord of the Rings: The Fellowship of the Ring, The} 0.35 0.9722222 2.160494 35
# Plot rules as scatterplot
plot(rules_movies, measure = c("confidence", "lift"), shading = "support", jitter = 1, engine = "html")
# Interactive matrix-based plot
plot(rules_movies, method = "matrix", shading ="confidence", engine = "html")
# Grouped matrix plot of rules
plot(rules_movies, method = "grouped", measure = "lift", shading = "confidence")
# Parallel coordinate plots with confidence as color coding
plot(rules_movies, method = "paracoord", shading = "confidence")
# Plot movie rules as a graph
plot(rules_movies, method = "graph", engine = "htmlwidget")
# Retrieve the top 10 rules with highest confidence
top10_rules_movies = head(sort(rules_movies, by = "confidence"), 10)
# Plot as an interactive graph the top 10 rules
plot(top10_rules_movies, method = "graph", engine = "htmlwidget")
# Extract rules with Pulp Fiction on the right side
pulpfiction_rules_rhs <- apriori(movie_trx, parameter = list(supp = 0.3, conf = 0.5),
appearance = list(default = "lhs", rhs = "Pulp Fiction")
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.3 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 30
##
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.02s].
## sorting and recoding items ... [56 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [19 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Inspect the first rules
arules::inspect(head(pulpfiction_rules_rhs))
## lhs rhs support confidence lift count
## [1] {Schindler's List} => {Pulp Fiction} 0.30 0.6818182 1.450677 30
## [2] {Jurassic Park} => {Pulp Fiction} 0.31 0.7209302 1.533894 31
## [3] {Seven (a.k.a. Se7en)} => {Pulp Fiction} 0.30 0.8108108 1.725129 30
## [4] {Lord of the Rings: The Fellowship of the Ring, The} => {Pulp Fiction} 0.31 0.6888889 1.465721 31
## [5] {Sixth Sense, The} => {Pulp Fiction} 0.31 0.7045455 1.499033 31
## [6] {Forrest Gump} => {Pulp Fiction} 0.33 0.7857143 1.671733 33
arules::inspect(head(sort(pulpfiction_rules_rhs, by="lift"), 10))
## lhs rhs support confidence lift count
## [1] {Fight Club,
## Silence of the Lambs, The} => {Pulp Fiction} 0.34 0.9189189 1.955147 34
## [2] {American Beauty,
## Silence of the Lambs, The} => {Pulp Fiction} 0.31 0.8857143 1.884498 31
## [3] {Shawshank Redemption, The,
## Silence of the Lambs, The} => {Pulp Fiction} 0.31 0.8857143 1.884498 31
## [4] {Fight Club,
## Matrix, The} => {Pulp Fiction} 0.30 0.8333333 1.773050 30
## [5] {Seven (a.k.a. Se7en)} => {Pulp Fiction} 0.30 0.8108108 1.725129 30
## [6] {American Beauty,
## Matrix, The} => {Pulp Fiction} 0.30 0.8108108 1.725129 30
## [7] {Matrix, The,
## Silence of the Lambs, The} => {Pulp Fiction} 0.32 0.8000000 1.702128 32
## [8] {American Beauty,
## Fight Club} => {Pulp Fiction} 0.30 0.7894737 1.679731 30
## [9] {Forrest Gump} => {Pulp Fiction} 0.33 0.7857143 1.671733 33
## [10] {Silence of the Lambs, The} => {Pulp Fiction} 0.38 0.7600000 1.617021 38
# Extract rules with Pulp Fiction on the left side
pulpfiction_rules_lhs <- apriori(movie_trx, parameter = list(supp = 0.3, conf = 0.5),
appearance = list(default = "rhs", lhs = "Pulp Fiction")
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.3 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 30
##
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [56 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [16 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Summary of extracted rules
summary(pulpfiction_rules_lhs)
## set of 16 rules
##
## rule length distribution (lhs + rhs):sizes
## 1 2
## 4 12
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 1.75 2.00 1.75 2.00 2.00
##
## summary of quality measures:
## support confidence lift count
## Min. :0.3000 Min. :0.5000 Min. :1.000 Min. :30.00
## 1st Qu.:0.3100 1st Qu.:0.6287 1st Qu.:1.234 1st Qu.:31.00
## Median :0.3600 Median :0.6596 Median :1.454 Median :36.00
## Mean :0.3887 Mean :0.6714 Mean :1.385 Mean :38.88
## 3rd Qu.:0.4100 3rd Qu.:0.7553 3rd Qu.:1.538 3rd Qu.:41.00
## Max. :0.6000 Max. :0.8085 Max. :1.725 Max. :60.00
##
## mining info:
## data ntransactions support confidence
## movie_trx 100 0.3 0.5
# Inspect the first rules
arules::inspect(head(pulpfiction_rules_lhs))
## lhs rhs support confidence lift
## [1] {} => {American Beauty} 0.57 0.5700000 1.000000
## [2] {} => {Silence of the Lambs, The} 0.50 0.5000000 1.000000
## [3] {} => {Fight Club} 0.54 0.5400000 1.000000
## [4] {} => {Matrix, The} 0.60 0.6000000 1.000000
## [5] {Pulp Fiction} => {Schindler's List} 0.30 0.6382979 1.450677
## [6] {Pulp Fiction} => {Jurassic Park} 0.31 0.6595745 1.533894
## count
## [1] 57
## [2] 50
## [3] 54
## [4] 60
## [5] 30
## [6] 31
# Extract rules with Pulp Fiction on the left side
pulpfiction_rules_lhs <- apriori(movie_trx, parameter = list(supp = 0.3, conf = 0.5, minlen = 2),
appearance = list(default = "rhs", lhs = "Pulp Fiction")
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.3 2
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 30
##
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.02s].
## sorting and recoding items ... [56 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [12 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Inspect the first rules
arules::inspect(head(pulpfiction_rules_lhs))
## lhs rhs support confidence lift count
## [1] {Pulp Fiction} => {Schindler's List} 0.30 0.6382979 1.450677 30
## [2] {Pulp Fiction} => {Jurassic Park} 0.31 0.6595745 1.533894 31
## [3] {Pulp Fiction} => {Seven (a.k.a. Se7en)} 0.30 0.6382979 1.725129 30
## [4] {Pulp Fiction} => {Lord of the Rings: The Fellowship of the Ring, The} 0.31 0.6595745 1.465721 31
## [5] {Pulp Fiction} => {Sixth Sense, The} 0.31 0.6595745 1.499033 31
## [6] {Pulp Fiction} => {Forrest Gump} 0.33 0.7021277 1.671733 33
Chapter 1 - Get Started with Shiny
Introduction to Shiny:
Build a “Hello World” Shiny App:
Build a babynames explorer Shiny App:
Example code includes:
library(shiny)
ui <- fluidPage(
# CODE BELOW: Add a text input "name"
textInput("name", "Enter your name: ")
)
server <- function(input, output) {
}
shinyApp(ui = ui, server = server)
ui <- fluidPage(
textInput("name", "What is your name?"),
# CODE BELOW: Display the text output, greeting
textOutput("greeting")
)
server <- function(input, output) {
# CODE BELOW: Render a text output, greeting
output$greeting <- renderText({paste0("Hello, ", input$name)})
}
shinyApp(ui = ui, server = server)
ui <- fluidPage(
# CODE BELOW: Add a text input "name"
textInput("name", "Enter Your Name", "David")
)
server <- function(input, output, session) {
}
shinyApp(ui = ui, server = server)
ui <- fluidPage(
textInput('name', 'Enter Name', 'David'),
# CODE BELOW: Display the plot output named 'trend'
plotOutput("trend")
)
server <- function(input, output, session) {
# CODE BELOW: Render an empty plot and assign to output named 'trend'
output$trend <- renderPlot({ggplot()})
}
shinyApp(ui = ui, server = server)
ui <- fluidPage(
titlePanel("Baby Name Explorer"),
sidebarLayout(
sidebarPanel(textInput('name', 'Enter Name', 'David')),
mainPanel(plotOutput('trend'))
)
)
server <- function(input, output, session) {
output$trend <- renderPlot({
# CODE BELOW: Update to display a line plot of the input name
babynames::babynames %>%
filter(name==input$name) %>%
ggplot(aes(x=year, y=prop, color=sex)) +
geom_line()
})
}
shinyApp(ui = ui, server = server)
Chapter 2 - Inputs, Outputs, and Layouts
Inputs:
Outputs:
Layouts and Themes:
Building Apps:
Example code includes:
ui <- fluidPage(
titlePanel("What's in a Name?"),
# CODE BELOW: Add select input named "sex" to choose between "M" and "F"
selectInput("sex", "Select Sex", selected="F", choices=c("F", "M")),
sliderInput("year", "Select Year", value=1900, min=1900, max=2010),
# Add plot output to display top 10 most popular names
plotOutput('plot_top_10_names')
)
server <- function(input, output, session){
# Render plot of top 10 most popular names
output$plot_top_10_names <- renderPlot({
# Get top 10 names by sex and year
top_10_names <- babynames::babynames %>%
# MODIFY CODE BELOW: Filter for the selected sex
filter(sex == input$sex) %>%
filter(year == input$year) %>%
top_n(10, prop)
# Plot top 10 names by sex and year
ggplot(top_10_names, aes(x = name, y = prop)) +
geom_col(fill = "#263e63")
})
}
shinyApp(ui = ui, server = server)
ui <- fluidPage(
titlePanel("What's in a Name?"),
# Add select input named "sex" to choose between "M" and "F"
selectInput('sex', 'Select Sex', choices = c("F", "M")),
# Add slider input named "year" to select year between 1900 and 2010
sliderInput('year', 'Select Year', min = 1900, max = 2010, value = 1900),
# CODE BELOW: Add table output named "table_top_10_names"
tableOutput("table_top_10_names")
)
server <- function(input, output, session){
# Function to create a data frame of top 10 names by sex and year
top_10_names <- function(){
top_10_names <- babynames::babynames %>%
filter(sex == input$sex) %>%
filter(year == input$year) %>%
top_n(10, prop)
}
# CODE BELOW: Render a table output named "table_top_10_names"
output$table_top_10_names <- renderTable({top_10_names()})
}
shinyApp(ui = ui, server = server)
ui <- fluidPage(
titlePanel("What's in a Name?"),
# Add select input named "sex" to choose between "M" and "F"
selectInput('sex', 'Select Sex', choices = c("M", "F")),
# Add slider input named "year" to select year between 1900 and 2010
sliderInput('year', 'Select Year', min = 1900, max = 2010, value = 1900),
# MODIFY CODE BELOW: Add a DT output named "table_top_10_names"
DT::DTOutput('table_top_10_names')
)
server <- function(input, output, session){
top_10_names <- function(){
babynames::babynames %>%
filter(sex == input$sex) %>%
filter(year == input$year) %>%
top_n(10, prop)
}
# MODIFY CODE BELOW: Render a DT output named "table_top_10_names"
output$table_top_10_names <- DT::renderDT({
DT::datatable(top_10_names())
})
}
shinyApp(ui = ui, server = server)
top_trendy_names <- data.frame(name=c('Kizzy', 'Deneen', 'Royalty', 'Mareli', 'Moesha', 'Marely', 'Kanye', 'Tennille', 'Aitana', 'Kadijah', 'Shaquille', 'Catina', 'Allisson', 'Emberly', 'Nakia', 'Jaslene', 'Kyrie', 'Akeelah', 'Zayn', 'Talan'), stringsAsFactors=FALSE)
ui <- fluidPage(
selectInput('name', 'Select Name', top_trendy_names$name),
# CODE BELOW: Add a plotly output named 'plot_trendy_names'
plotly::plotlyOutput("plot_trendy_names")
)
server <- function(input, output, session){
# Function to plot trends in a name
plot_trends <- function(){
babynames::babynames %>%
filter(name == input$name) %>%
ggplot(aes(x = year, y = n)) +
geom_col()
}
# CODE BELOW: Render a plotly output named 'plot_trendy_names'
output$plot_trendy_names <- plotly::renderPlotly({plot_trends()})
}
shinyApp(ui = ui, server = server)
ui <- fluidPage(
# MODIFY CODE BELOW: Wrap in a sidebarLayout
sidebarLayout(
# MODIFY CODE BELOW: Wrap in a sidebarPanel
sidebarPanel(selectInput('name', 'Select Name', top_trendy_names$name)),
# MODIFY CODE BELOW: Wrap in a mainPanel
mainPanel(plotly::plotlyOutput('plot_trendy_names'), DT::DTOutput('table_trendy_names'))
)
)
# DO NOT MODIFY
server <- function(input, output, session){
# Function to plot trends in a name
plot_trends <- function(){
babynames::babynames %>%
filter(name == input$name) %>%
ggplot(aes(x = year, y = n)) +
geom_col()
}
output$plot_trendy_names <- plotly::renderPlotly({
plot_trends()
})
output$table_trendy_names <- DT::renderDT({
babynames::babynames %>%
filter(name == input$name)
})
}
shinyApp(ui = ui, server = server)
ui <- fluidPage(
sidebarLayout(
sidebarPanel(selectInput('name', 'Select Name', top_trendy_names$name)),
mainPanel(
# MODIFY CODE BLOCK BELOW: Wrap in a tabsetPanel
tabsetPanel(
# MODIFY CODE BELOW: Wrap in a tabPanel providing an appropriate label
tabPanel("Plot", plotly::plotlyOutput('plot_trendy_names')),
# MODIFY CODE BELOW: Wrap in a tabPanel providing an appropriate label
tabPanel("Table", DT::DTOutput('table_trendy_names'))
)
)
)
)
server <- function(input, output, session){
# Function to plot trends in a name
plot_trends <- function(){
babynames::babynames %>%
filter(name == input$name) %>%
ggplot(aes(x = year, y = n)) +
geom_col()
}
output$plot_trendy_names <- plotly::renderPlotly({
plot_trends()
})
output$table_trendy_names <- DT::renderDT({
babynames::babynames %>%
filter(name == input$name)
})
}
shinyApp(ui = ui, server = server)
ui <- fluidPage(
# CODE BELOW: Add a titlePanel with an appropriate title
titlePanel("Trendy Names"),
# REPLACE CODE BELOW: with theme = shinythemes::shinytheme("<your theme>")
theme = shinythemes::shinytheme("spacelab"),
sidebarLayout(
sidebarPanel(
selectInput('name', 'Select Name', top_trendy_names$name)
),
mainPanel(
tabsetPanel(
tabPanel('Plot', plotly::plotlyOutput('plot_trendy_names')),
tabPanel('Table', DT::DTOutput('table_trendy_names'))
)
)
)
)
server <- function(input, output, session){
# Function to plot trends in a name
plot_trends <- function(){
babynames::babynames %>%
filter(name == input$name) %>%
ggplot(aes(x = year, y = n)) +
geom_col()
}
output$plot_trendy_names <- plotly::renderPlotly({
plot_trends()
})
output$table_trendy_names <- DT::renderDT({
babynames::babynames %>%
filter(name == input$name)
})
}
shinyApp(ui = ui, server = server)
ui <- fluidPage(
selectInput("greeting", "Select Greeting", selected="Hello", choices=c("Hello", "Bonjour")),
textInput("name", "Enter Your Name"),
textOutput("greeting")
)
server <- function(input, output, session) {
output$greeting <- renderText({paste0(input$greeting, ", ", input$name)})
}
shinyApp(ui = ui, server = server)
get_top_names <- function(.year, .sex) {
babynames::babynames %>%
filter(year == .year) %>%
filter(sex == .sex) %>%
top_n(10) %>%
mutate(name = forcats::fct_inorder(name))
}
ui <- fluidPage(
titlePanel("Most Popular Names"),
sidebarLayout(
sidebarPanel(
selectInput("sex", "Select Sex", selected="M", choices=c("M", "F")),
sliderInput("year", "Select Year", value=1900, min=1880, max=2017)
),
mainPanel(
plotOutput("popular")
)
)
)
server <- function(input, output, session) {
output$popular <- renderPlot({ get_top_names(input$year, input$sex) %>%
ggplot(aes(x=name, y=prop)) +
geom_col()
})
}
shinyApp(ui = ui, server = server)
ui <- fluidPage(
titlePanel("Most Popular Names"),
sidebarLayout(
sidebarPanel(
selectInput("sex", "Select Sex", selected="M", choices=c("M", "F")),
sliderInput("year", "Select Year", value=1900, min=1880, max=2017)
),
mainPanel(
tabsetPanel(
tabPanel("Plot", plotOutput("popular")),
tabPanel("Table", DT::DTOutput("table"))
)
)
)
)
server <- function(input, output, session) {
output$popular <- renderPlot({ get_top_names(input$year, input$sex) %>%
ggplot(aes(x=name, y=prop)) +
geom_col()
})
output$table <- DT::renderDT({ get_top_names(input$year, input$sex) })
}
shinyApp(ui = ui, server = server)
Chapter 3 - Reactive Programming
Reactivity 101:
Observers vs Reactives:
Stop-Delay-Trigger:
Applying Reactivity Concepts:
Example code includes:
server <- function(input, output, session) {
# CODE BELOW: Add a reactive expression rval_bmi to calculate BMI
rval_bmi <- reactive({ input$weight/(input$height^2) })
output$bmi <- renderText({
# MODIFY CODE BELOW: Replace right-hand-side with reactive expression
bmi <- rval_bmi()
paste("Your BMI is", round(bmi, 1))
})
output$bmi_range <- renderText({
# MODIFY CODE BELOW: Replace right-hand-side with reactive expression
bmi <- rval_bmi()
bmi_status <- cut(bmi,
breaks = c(0, 18.5, 24.9, 29.9, 40),
labels = c('underweight', 'healthy', 'overweight', 'obese')
)
paste("You are", bmi_status)
})
}
ui <- fluidPage(
titlePanel('BMI Calculator'),
sidebarLayout(
sidebarPanel(
numericInput('height', 'Enter your height in meters', 1.5, 1, 2),
numericInput('weight', 'Enter your weight in Kilograms', 60, 45, 120)
),
mainPanel(
textOutput("bmi"),
textOutput("bmi_range")
)
)
)
shinyApp(ui = ui, server = server)
server <- function(input, output, session) {
rval_bmi <- reactive({
input$weight/(input$height^2)
})
# CODE BELOW: Add a reactive expression rval_bmi_status to
# return health status as underweight etc. based on inputs
rval_bmi_status <- reactive({
cut(rval_bmi(), breaks = c(0, 18.5, 24.9, 29.9, 40),
labels = c('underweight', 'healthy', 'overweight', 'obese')
)
})
output$bmi <- renderText({
bmi <- rval_bmi()
paste("Your BMI is", round(bmi, 1))
})
output$bmi_status <- renderText({
# MODIFY CODE BELOW: Replace right-hand-side with
# reactive expression rval_bmi_status
bmi_status <- rval_bmi_status()
paste("You are", bmi_status)
})
}
ui <- fluidPage(
titlePanel('BMI Calculator'),
sidebarLayout(
sidebarPanel(
numericInput('height', 'Enter your height in meters', 1.5, 1, 2),
numericInput('weight', 'Enter your weight in Kilograms', 60, 45, 120)
),
mainPanel(
textOutput("bmi"),
textOutput("bmi_status")
)
)
)
shinyApp(ui = ui, server = server)
ui <- fluidPage(
textInput('name', 'Enter your name')
)
server <- function(input, output, session) {
# CODE BELOW: Add an observer to display a notification
# 'You have entered the name xxxx' where xxxx is the name
observe({showNotification(paste0("You have entered the name ", input$name))})
}
shinyApp(ui = ui, server = server)
server <- function(input, output, session) {
rval_bmi <- reactive({
input$weight/(input$height^2)
})
output$bmi <- renderText({
bmi <- rval_bmi()
# MODIFY CODE BELOW:
# Use isolate to stop output from updating when name changes.
paste("Hi", isolate({input$name}), ". Your BMI is", round(bmi, 1))
})
}
ui <- fluidPage(
titlePanel('BMI Calculator'),
sidebarLayout(
sidebarPanel(
textInput('name', 'Enter your name'),
numericInput('height', 'Enter your height (in m)', 1.5, 1, 2, step = 0.1),
numericInput('weight', 'Enter your weight (in Kg)', 60, 45, 120)
),
mainPanel(
textOutput("bmi")
)
)
)
shinyApp(ui = ui, server = server)
server <- function(input, output, session) {
# MODIFY CODE BELOW: Use eventReactive to delay the execution of the
# calculation until the user clicks on the show_bmi button (Show BMI)
rval_bmi <- eventReactive(input$show_bmi, {
input$weight/(input$height^2)
})
output$bmi <- renderText({
bmi <- rval_bmi()
paste("Hi", input$name, ". Your BMI is", round(bmi, 1))
})
}
ui <- fluidPage(
titlePanel('BMI Calculator'),
sidebarLayout(
sidebarPanel(
textInput('name', 'Enter your name'),
numericInput('height', 'Enter height (in m)', 1.5, 1, 2, step = 0.1),
numericInput('weight', 'Enter weight (in Kg)', 60, 45, 120),
actionButton("show_bmi", "Show BMI")
),
mainPanel(
textOutput("bmi")
)
)
)
shinyApp(ui = ui, server = server)
server <- function(input, output, session) {
# MODIFY CODE BELOW: Wrap in observeEvent() so the help text
# is displayed when a user clicks on the Help button.
observeEvent(input$show_help, {
# Display a modal dialog with bmi_help_text
# MODIFY CODE BELOW: Uncomment code
showModal(modalDialog(bmi_help_text))
})
rv_bmi <- eventReactive(input$show_bmi, {
input$weight/(input$height^2)
})
output$bmi <- renderText({
bmi <- rv_bmi()
paste("Hi", input$name, ". Your BMI is", round(bmi, 1))
})
}
ui <- fluidPage(
titlePanel('BMI Calculator'),
sidebarLayout(
sidebarPanel(
textInput('name', 'Enter your name'),
numericInput('height', 'Enter your height in meters', 1.5, 1, 2),
numericInput('weight', 'Enter your weight in Kilograms', 60, 45, 120),
actionButton("show_bmi", "Show BMI"),
# CODE BELOW: Add an action button named "show_help"
actionButton("show_help", "Help")
),
mainPanel(
textOutput("bmi")
)
)
)
shinyApp(ui = ui, server = server)
server <- function(input, output, session) {
# MODIFY CODE BELOW: Delay the height calculation until
# the show button is pressed
rval_height_cm <- eventReactive(input$show_height_cm, {
input$height * 2.54
})
output$height_cm <- renderText({
height_cm <- rval_height_cm()
})
}
ui <- fluidPage(
titlePanel("Inches to Centimeters Conversion"),
sidebarLayout(
sidebarPanel(
numericInput("height", "Height (in)", 60),
actionButton("show_height_cm", "Show height in cm")
),
mainPanel(
textOutput("height_cm")
)
)
)
shinyApp(ui = ui, server = server)
Chapter 4 - Build Shiny Apps
Build an Alien Sightings Dashboard:
Explore the 2014 Mental Health Tech Survey:
Explore Cuisines:
Mass Shootings:
Wrap Up:
Example code includes:
usa_ufo_sightings <- readr::read_csv("./RInputFiles/usa_ufo_sightings.csv")
mental_health_survey <- readr::read_csv("./RInputFiles/mental_health_survey_edited.csv")
recipes <- readRDS("./RInputFiles/recipes.rds")
mass_shootings <- readr::read_csv("./RInputFiles/mass-shootings.csv")
str(usa_ufo_sightings, give.attr=FALSE)
str(mental_health_survey, give.attr=FALSE)
str(recipes, give.attr=FALSE)
str(mass_shootings, give.attr=FALSE)
states <- sort(unique(usa_ufo_sightings$state))
ui <- fluidPage(
# CODE BELOW: Add a title
titlePanel("UFO Sightings"),
sidebarLayout(
sidebarPanel(
# CODE BELOW: One input to select a U.S. state
# And one input to select a range of dates
selectInput("state", "Choose a U.S. state:", selected="AK", choices=states),
dateRangeInput("date", "Choose a date range:", start="1920-01-01", end="1950-01-01")
),
mainPanel()
)
)
server <- function(input, output) {
}
shinyApp(ui, server)
server <- function(input, output) {
# CODE BELOW: Create a plot output name 'shapes', of sightings by shape,
# For the selected inputs
output$shapes <- renderPlot({
usa_ufo_sightings %>%
filter(state == input$state,
date_sighted >= input$dates[1],
date_sighted <= input$dates[2]) %>%
ggplot(aes(shape)) +
geom_bar() +
labs(x = "Shape", y = "# Sighted")
})
# CODE BELOW: Create a table output named 'duration_table', by shape,
# of # sighted, plus mean, median, max, and min duration of sightings
# for the selected inputs
output$duration_table <- renderTable({
usa_ufo_sightings %>%
filter(
state == input$state,
date_sighted >= input$dates[1],
date_sighted <= input$dates[2]
) %>%
group_by(shape) %>%
summarize(
nb_sighted = n(),
avg_duration_min = mean(duration_sec) / 60,
median_duration_min = median(duration_sec) / 60,
min_duration_min = min(duration_sec) / 60,
max_duration_min = max(duration_sec) / 60
)
})
}
ui <- fluidPage(
titlePanel("UFO Sightings"),
sidebarLayout(
sidebarPanel(
selectInput("state", "Choose a U.S. state:", choices = unique(usa_ufo_sightings$state)),
dateRangeInput("dates", "Choose a date range:",
start = "1920-01-01",
end = "1950-01-01")
),
mainPanel(
# Add plot output named 'shapes'
plotOutput("shapes"),
# Add table output named 'duration_table'
tableOutput("duration_table")
)
)
)
shinyApp(ui, server)
ui <- fluidPage(
titlePanel("UFO Sightings"),
sidebarPanel(
selectInput("state", "Choose a U.S. state:", choices = unique(usa_ufo_sightings$state)),
dateRangeInput("dates", "Choose a date range:",
start = "1920-01-01",
end = "1950-01-01"
)
),
# MODIFY CODE BELOW: Create a tab layout for the dashboard
mainPanel(
tabsetPanel( tabPanel("Plot", plotOutput("shapes")), tabPanel("Table", tableOutput("duration_table")) )
)
)
server <- function(input, output) {
output$shapes <- renderPlot({
usa_ufo_sightings %>%
filter(
state == input$state,
date_sighted >= input$dates[1],
date_sighted <= input$dates[2]
) %>%
ggplot(aes(shape)) +
geom_bar() +
labs(
x = "Shape",
y = "# Sighted"
)
})
output$duration_table <- renderTable({
usa_ufo_sightings %>%
filter(
state == input$state,
date_sighted >= input$dates[1],
date_sighted <= input$dates[2]
) %>%
group_by(shape) %>%
summarize(
nb_sighted = n(),
avg_duration_min = mean(duration_sec) / 60,
median_duration_min = median(duration_sec) / 60,
min_duration_min = min(duration_sec) / 60,
max_duration_min = max(duration_sec) / 60
)
})
}
shinyApp(ui, server)
ui <- fluidPage(
# CODE BELOW: Add an appropriate title
titlePanel("2014 Mental Health in Tech Survey"),
sidebarPanel(
checkboxGroupInput("mental_health_consequence", "Do you think that discussing a mental health issue with your employer would have negative consequences?", choices=c("Maybe", "Yes", "No"), selected="Maybe"),
shinyWidgets::pickerInput("mental_vs_physical", "Do you feel that your employer takes mental health as seriously as physical health?", choices=c("Don't know", "Yes","No"), selected="Nothing selected")
),
mainPanel(
plotOutput("ageHist")
)
)
server <- function(input, output, session) {
# CODE BELOW: Build a histogram of the age of respondents
# Filtered by the two inputs
output$ageHist <- renderPlot({
mental_health_survey %>%
filter(mental_health_consequence==input$mental_health_consequence,
mental_vs_physical==input$mental_vs_physical
) %>%
ggplot(aes(x=Age)) +
geom_histogram()
})
}
shinyApp(ui, server)
server <- function(input, output, session) {
output$age <- renderPlot({
# MODIFY CODE BELOW: Add validation that user selected a 3rd input
validate(
need(
input$mental_vs_physical != "",
"Make a selection for mental vs. physical health."
)
)
mental_health_survey %>%
filter(
work_interfere == input$work_interfere,
mental_health_consequence %in% input$mental_health_consequence,
mental_vs_physical %in% input$mental_vs_physical
) %>%
ggplot(aes(Age)) +
geom_histogram()
})
}
ui <- fluidPage(
titlePanel("2014 Mental Health in Tech Survey"),
sidebarPanel(
shinyWidgets::sliderTextInput(
inputId = "work_interfere",
label = "If you have a mental health condition, do you feel that it interferes with your work?",
grid = TRUE,
force_edges = TRUE,
choices = c("Never", "Rarely", "Sometimes", "Often")
),
checkboxGroupInput(
inputId = "mental_health_consequence",
label = "Do you think that discussing a mental health issue with your employer would have negative consequences?",
choices = c("Maybe", "Yes", "No"),
selected = "Maybe"
),
shinyWidgets::pickerInput(
inputId = "mental_vs_physical",
label = "Do you feel that your employer takes mental health as seriously as physical health?",
choices = c("Don't Know", "No", "Yes"),
multiple = TRUE
)
),
mainPanel(
plotOutput("age")
)
)
shinyApp(ui, server)
oldRecipe <- recipes
cuisineList <- vector("list", nrow(oldRecipe))
for (thisRow in 1:nrow(recipes)) {
cuisineList[[thisRow]] <- data.frame(id=oldRecipe$id[thisRow], cuisine=oldRecipe$cuisine[thisRow],
ingredient=oldRecipe$ingredients[thisRow][[1]],
stringsAsFactors=FALSE
)
}
recipes <- bind_rows(cuisineList)
str(recipes)
ui <- fluidPage(
titlePanel('Explore Cuisines'),
sidebarLayout(
sidebarPanel(
# CODE BELOW: Add an input named "cuisine" to select a cuisine
selectInput("cuisine", "Select Cuisine", choices=unique(recipes$cuisine), selected="greek"),
# CODE BELOW: Add an input named "nb_ingredients" to select # of ingredients
sliderInput("nb_ingredients", "Select No. of Ingredients", min=1, max=100, value=10)
),
mainPanel(
# CODE BELOW: Add a DT output named "dt_top_ingredients"
DT::DTOutput("dt_top_ingredients")
)
)
)
server <- function(input, output, session) {
# CODE BELOW: Render the top ingredients in a chosen cuisine as
# an interactive data table and assign it to output object `dt_top_ingredients`
output$dt_top_ingredients <- DT::renderDT({
recipes %>%
filter(cuisine == input$cuisine) %>%
count(ingredient, name="nb_recipes") %>%
arrange(desc(nb_recipes)) %>%
head(input$nb_ingredients)
})
}
shinyApp(ui, server)
recipes_enriched <- recipes %>%
count(cuisine, ingredient, name="nb_recipes") %>%
tidytext::bind_tf_idf(term="ingredient", document="cuisine", n="nb_recipes")
str(recipes_enriched)
ui <- fluidPage(
titlePanel('Explore Cuisines'),
sidebarLayout(
sidebarPanel(
selectInput('cuisine', 'Select Cuisine', unique(recipes$cuisine)),
sliderInput('nb_ingredients', 'Select No. of Ingredients', 1, 100, 10),
),
mainPanel(
tabsetPanel(
# CODE BELOW: Add a plotly output named "plot_dt_ingredients"
tabPanel("Plot", plotly::plotlyOutput("plot_top_ingredients")),
tabPanel('Table', DT::DTOutput('dt_top_ingredients'))
)
)
)
)
server <- function(input, output, session) {
# CODE BELOW: Add a reactive expression named `rval_top_ingredients` that
# filters `recipes_enriched` for the selected cuisine and top ingredients
# based on the tf_idf value.
rval_top_ingredients <- reactive({
recipes_enriched %>%
filter(cuisine==input$cuisine) %>%
arrange(desc(tf_idf)) %>%
head(input$nb_ingredients)
})
# CODE BELOW: Render a horizontal bar plot of top ingredients and
# the tf_idf of recipes they get used in, and assign it to an output named
# `plot_top_ingredients`
output$plot_top_ingredients <- plotly::renderPlotly({
ggplot(rval_top_ingredients(), aes(x=ingredient, y=tf_idf)) +
geom_col() +
coord_flip()
})
output$dt_top_ingredients <- DT::renderDT({
recipes %>%
filter(cuisine == input$cuisine) %>%
count(ingredient, name = 'nb_recipes') %>%
arrange(desc(nb_recipes)) %>%
head(input$nb_ingredients)
})
}
shinyApp(ui, server)
# ui <- fluidPage(
# titlePanel('Explore Cuisines'),
# sidebarLayout(
# sidebarPanel(
# selectInput('cuisine', 'Select Cuisine', unique(recipes$cuisine)),
# sliderInput('nb_ingredients', 'Select No. of Ingredients', 5, 100, 20),
# ),
# mainPanel(
# tabsetPanel(
# CODE BELOW: Add `d3wordcloudOutput` named `wc_ingredients` in a `tabPanel`
# tabPanel("Word Cloud", wordcloud2::wordcloud2Output("wc_ingredients")),
# tabPanel('Plot', plotly::plotlyOutput('plot_top_ingredients')),
# tabPanel('Table', DT::DTOutput('dt_top_ingredients'))
# )
# )
# )
# )
# server <- function(input, output, session){
# CODE BELOW: Render an interactive wordcloud of top ingredients and
# the number of recipes they get used in, using `d3wordcloud::renderD3wordcloud`,
# and assign it to an output named `wc_ingredients`.
# output$wc_ingredients <- wordcloud2::renderWordcloud2({
# d <- rval_top_ingredients()
# wordcloud2::wordcloud2(d)
# })
# rval_top_ingredients <- reactive({
# recipes_enriched %>%
# filter(cuisine == input$cuisine) %>%
# arrange(desc(tf_idf)) %>%
# head(input$nb_ingredients) %>%
# mutate(ingredient = forcats::fct_reorder(ingredient, tf_idf), word=as.character(ingredient),
# freq=nb_recipes
# )
# })
# output$plot_top_ingredients <- plotly::renderPlotly({
# rval_top_ingredients() %>%
# ggplot(aes(x = ingredient, y = tf_idf)) +
# geom_col() +
# coord_flip()
# })
# output$dt_top_ingredients <- DT::renderDT({
# recipes %>%
# filter(cuisine == input$cuisine) %>%
# count(ingredient, name = 'nb_recipes') %>%
# arrange(desc(nb_recipes)) %>%
# head(input$nb_ingredients)
# })
# }
# shinyApp(ui = ui, server= server)
mass_shootings$date <- lubridate::mdy(mass_shootings$date)
ui <- bootstrapPage(
theme = shinythemes::shinytheme('simplex'),
leaflet::leafletOutput('map', width = '100%', height = '100%'),
absolutePanel(top = 10, right = 10, id = 'controls',
sliderInput('nb_fatalities', 'Minimum Fatalities', 1, 40, 10),
dateRangeInput(
'date_range', 'Select Date', "2010-01-01", "2019-12-01"
),
# CODE BELOW: Add an action button named show_about
actionButton("show_about", "About")
),
tags$style(type = "text/css", "
html, body {width:100%;height:100%}
#controls{background-color:white;padding:20px;}
")
)
server <- function(input, output, session) {
# CODE BELOW: Use observeEvent to display a modal dialog
# with the help text stored in text_about.
observeEvent(input$show_about, {
showModal(modalDialog(text_about, title="About"))
})
output$map <- leaflet::renderLeaflet({
mass_shootings %>%
filter(
date >= input$date_range[1],
date <= input$date_range[2],
fatalities >= input$nb_fatalities
) %>%
leaflet::leaflet() %>%
leaflet::setView( -98.58, 39.82, zoom = 5) %>%
leaflet::addTiles() %>%
leaflet::addCircleMarkers(
popup = ~ summary, radius = ~ sqrt(fatalities)*3,
fillColor = 'red', color = 'red', weight = 1
)
})
}
shinyApp(ui, server)
Chapter 1 - Statistics
Stats with Geoms:
Stats: Sum and Quantile:
Stats Outside Geoms:
Example code includes:
# View the structure of mtcars
data(mtcars)
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# Using mtcars, draw a scatter plot of mpg vs. wt
ggplot(mtcars, aes(x=wt, y=mpg)) +
geom_point()
# Amend the plot to add a smooth layer
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Amend the plot. Use lin. reg. smoothing; turn off std err ribbon
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point() +
geom_smooth(method="lm", se=FALSE)
# Amend the plot. Swap geom_smooth() for stat_smooth().
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point() +
stat_smooth(method = "lm", se = FALSE)
mtcars <- mtcars %>%
mutate(fcyl=factor(cyl), fam=factor(am))
str(mtcars)
## 'data.frame': 32 obs. of 13 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
## $ fcyl: Factor w/ 3 levels "4","6","8": 2 2 1 2 3 2 3 1 1 2 ...
## $ fam : Factor w/ 2 levels "0","1": 2 2 2 1 1 1 1 1 1 1 ...
# Using mtcars, plot mpg vs. wt, colored by fcyl
ggplot(mtcars, aes(x=wt, y=mpg, color=fcyl)) +
# Add a point layer
geom_point() +
# Add a smooth lin reg stat, no ribbon
stat_smooth(method="lm", se=FALSE)
# Amend the plot to add another smooth layer with dummy grouping
ggplot(mtcars, aes(x = wt, y = mpg, color = fcyl)) +
geom_point() +
stat_smooth(method = "lm", se = FALSE) +
stat_smooth(aes(group=1), method="lm", se=FALSE)
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point() +
# Add 3 smooth LOESS stats, varying span & color
stat_smooth(method = "loess", color = "red", span = 0.9, se=FALSE) +
stat_smooth(method = "loess", color = "green", span = 0.6, se=FALSE) +
stat_smooth(method = "loess", color = "blue", span = 0.3, se=FALSE)
# Amend the plot to color by fcyl
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point() +
# Add a smooth LOESS stat, no ribbon
stat_smooth(method="loess", se=FALSE) +
# Add a smooth lin. reg. stat, no ribbon
stat_smooth(method="lm", se=FALSE)
# Amend the plot
ggplot(mtcars, aes(x = wt, y = mpg, color = fcyl)) +
geom_point() +
# Map color to dummy variable "All"
stat_smooth(aes(color="All"), se = FALSE) +
stat_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
data(Vocab, package="carData")
Vocab <- Vocab %>%
mutate(year_group=factor(ifelse(year<=1994, 1974, 2016)))
str(Vocab)
## 'data.frame': 30351 obs. of 5 variables:
## $ year : num 1974 1974 1974 1974 1974 ...
## $ sex : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 2 2 2 1 1 ...
## $ education : num 14 16 10 10 12 16 17 10 12 11 ...
## $ vocabulary: num 9 9 9 5 8 8 9 5 3 5 ...
## $ year_group: Factor w/ 2 levels "1974","2016": 1 1 1 1 1 1 1 1 1 1 ...
# Using Vocab, plot vocabulary vs. education, colored by year group
ggplot(Vocab, aes(x=education, y=vocabulary, color=year_group)) +
# Add jittered points with transparency 0.25
geom_jitter(alpha=0.25) +
# Add a smooth lin. reg. line (with ribbon)
stat_smooth(method="lm")
# Amend the plot
ggplot(Vocab, aes(x = education, y = vocabulary, color = year_group)) +
geom_jitter(alpha = 0.25) +
# Map the fill color to year_group, set the line size to 2
stat_smooth(method = "lm", aes(fill=year_group), size=2)
# Amend the plot to color by year_group
ggplot(Vocab, aes(x = education, y = vocabulary)) +
geom_jitter(alpha = 0.25) +
stat_quantile(quantiles = c(0.05, 0.5, 0.95))
## Smoothing formula not specified. Using: y ~ x
# Amend the plot to color by year_group
ggplot(Vocab, aes(x = education, y = vocabulary, color=year_group)) +
geom_jitter(alpha = 0.25) +
stat_quantile(quantiles = c(0.05, 0.5, 0.95))
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique
# Run this, look at the plot, then update it
ggplot(Vocab, aes(x = education, y = vocabulary)) +
# Replace this with a sum stat
stat_sum()
ggplot(Vocab, aes(x = education, y = vocabulary)) +
stat_sum() +
# Add a size scale, from 1 to 10
scale_size(range=c(1, 10))
# Amend the stat to use proportion sizes
ggplot(Vocab, aes(x = education, y = vocabulary)) +
stat_sum(aes(size = ..prop..))
# Amend the plot to group by education
ggplot(Vocab, aes(x = education, y = vocabulary, group = education)) +
stat_sum(aes(size = ..prop..))
# From previous step
posn_j <- position_jitter(width = 0.2)
posn_d <- position_dodge(width = 0.1)
posn_jd <- position_jitterdodge(jitter.width = 0.2, dodge.width = 0.1)
# Create the plot base: wt vs. fcyl, colored by fam
p_wt_vs_fcyl_by_fam <- ggplot(mtcars, aes(x=fcyl, y=wt, color=fam))
# Add a point layer
p_wt_vs_fcyl_by_fam +
geom_point()
# Add jittering only
p_wt_vs_fcyl_by_fam +
geom_point(position=posn_j)
# Add dodging only
p_wt_vs_fcyl_by_fam +
geom_point(position=posn_d)
# Add jittering and dodging
p_wt_vs_fcyl_by_fam_jit <- p_wt_vs_fcyl_by_fam +
geom_point(position=posn_jd)
p_wt_vs_fcyl_by_fam_jit
p_wt_vs_fcyl_by_fam_jit +
# Add a summary stat of std deviation limits
stat_summary(fun.data=mean_sdl, fun.args=list(mult=1), position=posn_d)
p_wt_vs_fcyl_by_fam_jit +
# Change the geom to be an errorbar
stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), position = posn_d, geom="errorbar")
p_wt_vs_fcyl_by_fam_jit +
# Add a summary stat of normal confidence limits
stat_summary(fun.data = mean_cl_normal, position = posn_d)
Chapter 2 - Coordinates
Coordinates:
Coordinates vs Scales:
Double and Flipped Axes:
Polar Coordinates:
Example code includes:
ggplot(mtcars, aes(x = wt, y = hp, color = fam)) +
geom_point() +
geom_smooth() +
# Add Cartesian coordinates with x limits from 3 to 6
coord_cartesian(xlim=c(3, 6))
data(iris)
str(iris)
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width, color = Species)) +
geom_jitter() +
geom_smooth(method = "lm", se = FALSE) +
# Fix the coordinate ratio
coord_fixed(1)
data(sunspot.month)
str(sunspot.month)
sunspots <- data.frame(Date=lubridate::ymd("1749-1-1") + months(0:(length(sunspot.month)-1)),
Sunspots=as.numeric(sunspot.month)
)
str(sunspots)
sun_plot <- ggplot(sunspots, aes(x=Date, y=Sunspots)) +
geom_line(col="lightblue") +
geom_rect(aes(xmin=as.Date("1860-01-01"), xmax=as.Date("1935-01-01"), ymin=175, ymax=250),
col="orange", fill=NA
)
# Fix the aspect ratio to 1:1
sun_plot +
coord_fixed(1)
# Change the aspect ratio to 20:1
sun_plot +
coord_fixed(20)
ggplot(mtcars, aes(wt, mpg)) +
geom_point(size = 2) +
# Add Cartesian coordinates with zero expansion
coord_cartesian(expand=0) +
theme_classic()
ggplot(mtcars, aes(wt, mpg)) +
geom_point(size = 2) +
# Turn clipping off
coord_cartesian(expand = 0, clip="off") +
theme_classic() +
# Remove axis lines
theme(axis.line=element_blank())
data(msleep, package="ggplot2")
msleep <- msleep %>%
select(bodywt, brainwt, vore) %>%
filter(complete.cases(.))
str(msleep)
# Produce a scatter plot of brainwt vs. bodywt
ggplot(msleep, aes(x=bodywt, y=brainwt)) +
geom_point() +
ggtitle("Raw Values")
# Add scale_*_*() functions
ggplot(msleep, aes(bodywt, brainwt)) +
geom_point() +
scale_x_log10() +
scale_y_log10() +
ggtitle("Scale_ functions")
# Perform a log10 coordinate system transformation
ggplot(msleep, aes(bodywt, brainwt)) +
geom_point() +
coord_trans(x="log10", y="log10")
# Plot with a scale_*_*() function:
ggplot(msleep, aes(bodywt, brainwt)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
# Add a log10 x scale
scale_x_log10() +
# Add a log10 y scale
scale_y_log10() +
ggtitle("Scale functions")
# Plot with transformed coordinates
ggplot(msleep, aes(bodywt, brainwt)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
# Add a log10 coordinate transformation for x and y axes
coord_trans(x="log10", y="log10")
data(airquality)
airquality <- airquality %>%
mutate(Date=lubridate::ymd(paste0("1973-", Month, "-", Day)))
str(airquality)
# Using airquality, plot Temp vs. Date
ggplot(airquality, aes(x=Date, y=Temp)) +
# Add a line layer
geom_line() +
labs(x = "Date (1973)", y = "Fahrenheit")
# Define breaks (Fahrenheit)
y_breaks <- c(59, 68, 77, 86, 95, 104)
# Convert y_breaks from Fahrenheit to Celsius
y_labels <- (y_breaks - 32) / 1.8
# Create a secondary x-axis
secondary_y_axis <- sec_axis(
# Use identity transformation
trans = "identity",
name = "Celsius",
# Define breaks and labels as above
breaks = y_breaks,
labels = y_labels
)
# Examine the object
secondary_y_axis
# From previous step
y_breaks <- c(59, 68, 77, 86, 95, 104)
y_labels <- (y_breaks - 32) * 5 / 9
secondary_y_axis <- sec_axis(
trans = identity,
name = "Celsius",
breaks = y_breaks,
labels = y_labels
)
# Update the plot
ggplot(airquality, aes(Date, Temp)) +
geom_line() +
# Add the secondary y-axis
scale_y_continuous(sec.axis = secondary_y_axis) +
labs(x = "Date (1973)", y = "Fahrenheit")
# Plot fcyl bars, filled by fam
ggplot(mtcars, aes(x=fcyl, fill = fam)) +
# Place bars side by side
geom_bar(position = "dodge")
ggplot(mtcars, aes(fcyl, fill = fam)) +
# Set a dodge width of 0.5 for partially overlapping bars
geom_bar(position = position_dodge(width=0.5)) +
coord_flip()
mtcars$car <- c('Mazda RX4', 'Mazda RX4 Wag', 'Datsun 710', 'Hornet 4 Drive', 'Hornet Sportabout', 'Valiant', 'Duster 360', 'Merc 240D', 'Merc 230', 'Merc 280', 'Merc 280C', 'Merc 450SE', 'Merc 450SL', 'Merc 450SLC', 'Cadillac Fleetwood', 'Lincoln Continental', 'Chrysler Imperial', 'Fiat 128', 'Honda Civic', 'Toyota Corolla', 'Toyota Corona', 'Dodge Challenger', 'AMC Javelin', 'Camaro Z28', 'Pontiac Firebird', 'Fiat X1-9', 'Porsche 914-2', 'Lotus Europa', 'Ford Pantera L', 'Ferrari Dino', 'Maserati Bora', 'Volvo 142E')
str(mtcars)
# Plot of wt vs. car
ggplot(mtcars, aes(x=car, y=wt)) +
# Add a point layer
geom_point() +
labs(x = "car", y = "weight")
# Flip the axes to set car to the y axis
ggplot(mtcars, aes(car, wt)) +
geom_point() +
labs(x = "car", y = "weight") +
coord_flip()
ggplot(mtcars, aes(x = 1, fill = fcyl)) +
# Reduce the bar width to 0.1
geom_bar(width=0.1) +
coord_polar(theta = "y") +
# Add a continuous x scale from 0.5 to 1.5
scale_x_continuous(limits=c(0.5, 1.5))
dirs <- c("N", "NNE", "NE", "ENE", "E", "ESE", "SE", "SSE",
"S", "SSW", "SW", "WSW", "W", "WNW", "NW", "NNW"
)
data(mydata, package="openair")
wind <- mydata %>%
select(date, ws, wd) %>%
filter(date >= as.Date("2003-01-01"), date <= as.Date("2003-12-31")) %>%
mutate(orig_ws=ws, orig_wd=wd, base_ws=2 * (ws %/% 2),
base_wd=round(((wd + 11.25) %% 360) %/% 22.5),
ws=factor(ifelse(base_ws>=20, "20+", paste0(base_ws, "-", base_ws+2)),
levels=c("20+", "18-20", "16-18", "14-16", "12-14", "10-12", "8-10",
"6-8", "4-6", "2-4", "0-2")
),
wd=factor(dirs[base_wd+1], levels=dirs)
) %>%
filter(complete.cases(.))
str(wind)
# Using wind, plot wd filled by ws
ggplot(wind, aes(x=wd, fill=ws)) +
# Add a bar layer with width 1
geom_bar(width=1)
# Convert to polar coordinates:
ggplot(wind, aes(wd, fill = ws)) +
geom_bar(width = 1) +
coord_polar()
# Convert to polar coordinates:
ggplot(wind, aes(wd, fill = ws)) +
geom_bar(width = 1) +
coord_polar(start = -pi/16)
Chapter 3 - Facets
Facets Layer:
Facet Labels and Order:
Facet Plotting Spaces:
Facet Wrap and Margins:
Example code includes:
ggplot(mtcars, aes(wt, mpg)) +
geom_point() +
# Facet rows by am
facet_grid(rows=vars(am))
ggplot(mtcars, aes(wt, mpg)) +
geom_point() +
# Facet columns by cyl
facet_grid(cols=vars(cyl))
ggplot(mtcars, aes(wt, mpg)) +
geom_point() +
# Facet rows by am and columns by cyl
facet_grid(rows=vars(am), cols=vars(cyl))
# See the interaction column
mtcars <- mtcars %>%
mutate(fcyl_fam=factor(paste0(fcyl, "_", fam)))
mtcars$fcyl_fam
# Color the points by fcyl_fam
ggplot(mtcars, aes(x = wt, y = mpg, color = fcyl_fam)) +
geom_point() +
# Use a paired color palette
scale_color_brewer(palette = "Paired")
# Update the plot to map disp to size
ggplot(mtcars, aes(x = wt, y = mpg, color = fcyl_fam, size=disp)) +
geom_point() +
scale_color_brewer(palette = "Paired")
# Update the plot
ggplot(mtcars, aes(x = wt, y = mpg, color = fcyl_fam, size = disp)) +
geom_point() +
scale_color_brewer(palette = "Paired") +
# Grid facet on gear and vs
facet_grid(rows = vars(gear), cols = vars(vs))
ggplot(mtcars, aes(wt, mpg)) +
geom_point() +
# Facet rows by am using formula notation
facet_grid(am ~ .)
ggplot(mtcars, aes(wt, mpg)) +
geom_point() +
# Facet columns by cyl using formula notation
facet_grid(. ~ cyl)
ggplot(mtcars, aes(wt, mpg)) +
geom_point() +
# Facet rows by am and columns by cyl using formula notation
facet_grid(am ~ cyl)
# Plot wt by mpg
ggplot(mtcars, aes(wt, mpg)) +
geom_point() +
# The default is label_value
facet_grid(cols = vars(cyl))
# Plot wt by mpg
ggplot(mtcars, aes(wt, mpg)) +
geom_point() +
# Displaying both the values and the variables
facet_grid(cols = vars(cyl), labeller = label_both)
# Plot wt by mpg
ggplot(mtcars, aes(wt, mpg)) +
geom_point() +
# Label context
facet_grid(cols = vars(cyl), labeller = label_context)
# Plot wt by mpg
ggplot(mtcars, aes(wt, mpg)) +
geom_point() +
# Two variables
facet_grid(cols = vars(vs, cyl), labeller = label_context)
# Make factor, set proper labels explictly
mtcars$fam <- factor(mtcars$am, labels = c(`0` = "automatic", `1` = "manual"))
# Default order is alphabetical
ggplot(mtcars, aes(wt, mpg)) +
geom_point() +
facet_grid(cols = vars(fam))
# Make factor, set proper labels explictly, and
# manually set the label order
mtcars$fam <- factor(mtcars$am, levels = c(1, 0), labels = c("manual", "automatic"))
# View again
ggplot(mtcars, aes(wt, mpg)) +
geom_point() +
facet_grid(cols = vars(fam))
ggplot(mtcars, aes(wt, mpg)) +
geom_point() +
# Facet columns by cyl
facet_grid(cols=vars(cyl))
ggplot(mtcars, aes(wt, mpg)) +
geom_point() +
# Update the faceting to free the x-axis scales
facet_grid(cols = vars(cyl), scales="free_x")
ggplot(mtcars, aes(wt, mpg)) +
geom_point() +
# Swap cols for rows; free the y-axis scales
facet_grid(rows = vars(cyl), scales = "free_y")
ggplot(mtcars, aes(x = mpg, y = car, color = fam)) +
geom_point() +
# Facet rows by gear
facet_grid(rows=vars(gear))
ggplot(mtcars, aes(x = mpg, y = car, color = fam)) +
geom_point() +
# Free the y scales and space
facet_grid(rows = vars(gear), scales="free_y", space="free_y")
ggplot(Vocab, aes(x = education, y = vocabulary)) +
stat_smooth(method = "lm", se = FALSE) +
# Create facets, wrapping by year, using vars()
facet_wrap(vars(year))
ggplot(Vocab, aes(x = education, y = vocabulary)) +
stat_smooth(method = "lm", se = FALSE) +
# Create facets, wrapping by year, using a formula
facet_wrap(~ year)
ggplot(Vocab, aes(x = education, y = vocabulary)) +
stat_smooth(method = "lm", se = FALSE) +
# Update the facet layout, using 11 columns
facet_wrap(~ year, ncol=11)
mtcars <- mtcars %>%
mutate(fam=factor(am, levels=c(0, 1), labels=c("automatic", "manual")),
fvs=factor(vs, levels=c(0, 1), labels=c("V-shaped", "straight"))
)
str(mtcars)
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point() +
# Facet rows by fvs and cols by fam
facet_grid(rows=vars(fvs, fam), cols=vars(gear))
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point() +
# Update the facets to add margins
facet_grid(rows = vars(fvs, fam), cols = vars(gear), margins=TRUE)
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point() +
# Update the facets to only show margins on fam
facet_grid(rows = vars(fvs, fam), cols = vars(gear), margins = "fam")
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point() +
# Update the facets to only show margins on gear and fvs
facet_grid(rows = vars(fvs, fam), cols = vars(gear), margins = c("gear", "fvs"))
Chapter 4 - Best Practices
Best Practices: Bar Plots:
Heatmaps: Use Case Scenario:
Good Data can make Bad Plots:
Example code includes:
# Plot wt vs. fcyl
ggplot(mtcars, aes(x = fcyl, y = wt)) +
# Add a bar summary stat of means, colored skyblue
stat_summary(fun.y = mean, geom = "bar", fill = "skyblue") +
# Add an errorbar summary stat std deviation limits
stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "errorbar", width = 0.1)
# Update the aesthetics to color and fill by fam
ggplot(mtcars, aes(x = fcyl, y = wt, color=fam, fill=fam)) +
stat_summary(fun.y = mean, geom = "bar") +
stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "errorbar", width = 0.1)
# Set alpha for the first and set position for each stat summary function
ggplot(mtcars, aes(x = fcyl, y = wt, color = fam, fill = fam)) +
stat_summary(fun.y = mean, geom = "bar", alpha = 0.5, position = "dodge") +
stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "errorbar", position = "dodge", width = 0.1)
# Define a dodge position object with width 0.9
posn_d <- position_dodge(width=0.9)
# For each summary stat, update the position to posn_d
ggplot(mtcars, aes(x = fcyl, y = wt, color = fam, fill = fam)) +
stat_summary(fun.y = mean, geom = "bar", position = posn_d, alpha = 0.5) +
stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), width = 0.1, position = posn_d, geom = "errorbar")
mtcars_by_cyl <- mtcars %>%
group_by(cyl) %>%
summarize(mean_wt=mean(wt), sd_wt=sd(wt), n_wt=n()) %>%
mutate(prop=n_wt/sum(n_wt))
mtcars_by_cyl
# Using mtcars_cyl, plot mean_wt vs. cyl
ggplot(mtcars_by_cyl, aes(x=cyl, y=mean_wt)) +
# Add a bar layer with identity stat, filled skyblue
geom_bar(stat="identity", fill="skyblue")
ggplot(mtcars_by_cyl, aes(x = cyl, y = mean_wt)) +
# Swap geom_bar() for geom_col()
geom_col(fill = "skyblue")
ggplot(mtcars_by_cyl, aes(x = cyl, y = mean_wt)) +
# Set the width aesthetic to prop
geom_col(fill = "skyblue", aes(width=prop))
ggplot(mtcars_by_cyl, aes(x = cyl, y = mean_wt)) +
geom_col(aes(width = prop), fill = "skyblue") +
# Add an errorbar layer
geom_errorbar(
# ... at mean weight plus or minus 1 std dev
aes(ymin=mean_wt-sd_wt, ymax=mean_wt+sd_wt),
# with width 0.1
width=0.1
)
data(barley, package="lattice")
str(barley)
# Using barley, plot variety vs. year, filled by yield
ggplot(barley, aes(x=year, y=variety, fill=yield)) +
# Add a tile geom
geom_tile()
# Previously defined
ggplot(barley, aes(x = year, y = variety, fill = yield)) +
geom_tile() +
# Facet, wrapping by site, with 1 column
facet_wrap(facets = vars(site), ncol = 1) +
# Add a fill scale using an 2-color gradient
scale_fill_gradient(low = "white", high = "red")
# A palette of 9 reds
red_brewer_palette <- RColorBrewer::brewer.pal(9, "Reds")
# Update the plot
ggplot(barley, aes(x = year, y = variety, fill = yield)) +
geom_tile() +
facet_wrap(facets = vars(site), ncol = 1) +
# Update scale to use n-colors from red_brewer_palette
scale_fill_gradientn(colors=red_brewer_palette)
# The heat map we want to replace
# Don't remove, it's here to help you!
ggplot(barley, aes(x = year, y = variety, fill = yield)) +
geom_tile() +
facet_wrap( ~ site, ncol = 1) +
scale_fill_gradientn(colors = RColorBrewer::brewer.pal(9, "Reds"))
# Using barley, plot yield vs. year, colored and grouped by variety
ggplot(barley, aes(x=year, y=yield, color=variety, group=variety)) +
# Add a line layer
geom_line() +
# Facet, wrapping by site, with 1 row
facet_wrap( ~ site, nrow = 1)
# Using barely, plot yield vs. year, colored, grouped, and filled by site
ggplot(barley, aes(x = year, y = yield, color = site, group = site, fill = site)) +
# Add a line summary stat aggregated by mean
stat_summary(fun.y = mean, geom = "line") +
# Add a ribbon summary stat with 10% opacity, no color
stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "ribbon", alpha = 0.1, color = NA)
data(ToothGrowth)
TG <- ToothGrowth
str(TG)
# Initial plot
growth_by_dose <- ggplot(TG, aes(dose, len, color = supp)) +
stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), position = position_dodge(0.1)) +
theme_classic()
# View plot
growth_by_dose
# Change type
TG$dose <- as.numeric(as.character(TG$dose))
# Plot
growth_by_dose <- ggplot(TG, aes(dose, len, color = supp)) +
stat_summary(fun.data = mean_sdl,
fun.args = list(mult = 1),
position = position_dodge(0.2)) +
stat_summary(fun.y = mean,
geom = "line",
position = position_dodge(0.1)) +
theme_classic() +
# Adjust labels and colors:
labs(x = "Dose (mg/day)", y = "Odontoblasts length (mean, standard deviation)", color = "Supplement") +
scale_color_brewer(palette = "Set1", labels = c("Orange juice", "Ascorbic acid")) +
scale_y_continuous(limits = c(0,35), breaks = seq(0, 35, 5), expand = c(0,0))
# View plot
growth_by_dose
Chapter 1 - Probability Distributions
Discrete Distributions:
Continuous Distributions:
Central Limit Theorem:
Example code includes:
set.seed(123)
# Generate the outcomes of basketball shots
shots <- rbinom(n = 10, size = 1, p = 0.5)
print(shots)
# Draw the frequency chart of the results
barplot(table(shots))
set.seed(123)
# Generate the outcomes of basketball shots
shots <- rbinom(n = 10, size = 1, prob = 0.3)
print(shots)
# Draw the frequency chart of the results
barplot(table(shots))
set.seed(123)
# Generate the outcomes of basketball shots
shots <- rbinom(n = 10, size = 1, prob = 0.9)
print(shots)
# Draw the frequency chart of the results
barplot(table(shots))
# The probability of getting 6 tails
six_tails <- dbinom(x = 6, size = 10, p = 0.5)
print(six_tails)
# The probability of getting 7 or less tails
seven_or_less <- pbinom(q = 7, size = 10, p = 0.5)
print(seven_or_less)
# The probability of getting 5 or more tails
five_or_more <- 1 - pbinom(q = 4, size = 10, p = 0.5)
print(five_or_more)
# Probability that X is lower than 7
lower_than_seven <- punif(q = 7, min = 1, max = 10)
print(lower_than_seven)
# Probability that X is lower or equal to 4
four_or_lower <- punif(q = 4, min = 1, max = 10)
print(four_or_lower)
# Probability that X falls into the range [4, 7]
between_four_and_seven <- lower_than_seven - four_or_lower
print(between_four_and_seven)
set.seed(123)
# Set the sample size
n = 50000
# Generate random samples from three distributions
sample_N01 <- rnorm(n)
sample_N03 <- rnorm(n, mean = 0, sd = sqrt(3))
sample_N21 <- rnorm(n, mean = 2, sd = 1)
# Visualize the distributions
data <- data.frame(sample_N01, sample_N03, sample_N21)
data %>% gather(key = distribution, value) %>%
ggplot(aes(x = value, fill = distribution)) +
geom_density(alpha = 0.3)
set.seed(123)
# Generate data points
data <- rnorm(n = 1000)
# Inspect the distribution
hist(data)
# Compute the true probability and print it
true_probability <- 1 - pnorm(q = 2)
print(true_probability)
# Compute the sample probability and print it
sample_probability <- mean(data > 2)
print(sample_probability)
set.seed(1)
# Create a sample of 20 die rolls
small_sample <- sample(1:6, size = 20, replace = TRUE)
# Calculate the mean of the small sample
mean(small_sample)
# Create a sample of 1000 die rolls
big_sample <- sample(1:6, size = 1000, replace = TRUE)
# Calculate the mean of the big sample
mean(big_sample)
die_outputs <- vector("integer", 1000)
mean_die_outputs <- vector("numeric", 1000)
# Simulate 1000 die roll outputs
for (i in 1:1000) {
die_outputs[i] <- sample(1:6, size = 1)
}
# Visualize the number of occurrences of each result
barplot(table(die_outputs))
# Calculate 1000 means of 30 die roll outputs
for (i in 1:1000) {
mean_die_outputs[i] <- mean(sample(1:6, size = 30, replace = TRUE))
}
# Inspect the distribution of the results
hist(mean_die_outputs)
Chapter 2 - Exploratory Data Analysis
Descriptive Statistics:
Categorical Data:
Time Series:
Principal Component Analysis:
Example code includes:
data(cats, package="MASS")
str(cats)
# Compute the average of Hwt
mean(cats$Hwt)
# Compute the median of Hwt
median(cats$Hwt)
# Inspect the distribution of Hwt
hist(cats$Hwt)
# Subset female cats
female_cats <- subset(cats, Sex == "F")
# Compute the variance of Bwt for females
var(female_cats$Bwt)
# Subset male cats
male_cats <- subset(cats, Sex == "M")
# Compute the variance of Bwt for males
var(male_cats$Bwt)
data(survey, package="MASS")
str(survey)
# Return the structure of Exer
str(survey$Exer)
# Create the ordered factor
survey$Exer_ordered <- factor(survey$Exer, levels = c("None", "Some", "Freq"), ordered = TRUE)
# Return the structure of Exer_ordered
str(survey$Exer_ordered)
# Build a contingency table for Exer_ordered
table(survey$Exer_ordered)
# Compute mean pulse for groups
tapply(survey$Pulse, survey$Exer_ordered, mean, na.rm = TRUE)
library(caret)
surveyCC <- survey[complete.cases(survey), ]
str(surveyCC)
# Fit a linear model
lm(Pulse ~ Exer, data = surveyCC)
# Create one hot encoder
encoder <- caret::dummyVars(~ Exer, data = surveyCC)
# Encode Exer
Exer_encoded <- predict(encoder, newdata = surveyCC)
# Bind intercept and independent variables
X <- cbind(1, Exer_encoded[, 2:3])
# Compute coefficients
solve((t(X)%*%X))%*%t(X)%*%surveyCC$Pulse
library(xts)
gas <- readr::read_csv("./RInputFiles/natural_gas_monthly.xls")
# View the structure of gas
str(gas)
# Coerce to date class
gas$Date <- as.Date(paste0(gas$Month, "-", "01"))
# Create the xts object
gas_ts <- xts(x = gas$Price, order.by = gas$Date)
# Plot the time series
plot(gas_ts)
# Create the sequence of dates
dates_2014 <- seq(from = as.Date("2014-01-01"), to = as.Date("2014-12-31"), by = "1 day")
# Subset the time series
gas_2014 <- gas_ts[dates_2014]
# Plot the time series
plot(gas_2014)
# Compute monthly means
apply.monthly(gas_2014, mean)
# Plot the unrotated data
plot(Bwt ~ Hwt, data = cats)
# Perform PCA
pca_cats <- prcomp(~ Bwt + Hwt, data = cats)
# Compute the summary
summary(pca_cats)
# Compute the rotated data
principal_components <- predict(pca_cats)
# Plot the rotated data
plot(principal_components)
letter_recognition <- readr::read_csv("./RInputFiles/letter-recognition.data")
str(letter_recognition)
# Perform PCA on all predictive variables
pca_letters <- prcomp(letter_recognition[, -1])
# Output spread measures of principal components
summary(pca_letters)
# Perform PCA on all predictive variables
pca_letters <- prcomp(letter_recognition[, -1], tol = 0.25)
# Output spread measures of principal components
summary(pca_letters)
# Perform PCA on all predictive variables
pca_letters <- prcomp(letter_recognition[, -1], rank = 7)
# Output spread measures of principal components
summary(pca_letters)
Chapter 3 - Statistical Tests
Normality Tests:
Inference for a Mean:
Comparing Two Means:
ANOVA:
Example code includes:
# Plot the distribution of Hwt
hist(cats$Hwt)
# Assess the normality of Hwt numerically
shapiro.test(cats$Hwt)
# Plot the distribution of the logarithm of Hwt
hist(log(cats$Hwt))
# Assess the normality of the logarithm of Hwt numerically
shapiro.test(log(cats$Hwt))
# Draw a Q-Q plot for Hwt
qqnorm(cats$Hwt)
# Add a reference line
qqline(cats$Hwt)
# Draw a Q-Q plot for logarithm of Hwt
qqnorm(log(cats$Hwt))
# Add a reference line
qqline(log(cats$Hwt))
data(sleep)
str(sleep)
# Test normality of extra
shapiro.test(sleep$extra)
# Calculate mean of extra
mean(sleep$extra)
# Derive 95% confidence interval
t.test(sleep$extra)$conf.int
# Derive 90% confidence interval
t.test(sleep$extra, conf.level = 0.9)$conf.int
# Derive 99% confidence interval
t.test(sleep$extra, conf.level = 0.99)$conf.int
# Subset data for group 1
group1 <- subset(sleep, group == 1)
# Subset data for group 2
group2 <- subset(sleep, group == 2)
# Test if mean of extra for group 1 amounts to 2.2
t.test(group1$extra, mu = 2.2)
# Test if mean of extra for group 2 amounts to 2.2
t.test(group2$extra, mu = 2.2)
# Test normality of sample 1
# shapiro.test(df$value[df$sample == 1])
# Test normality of sample 2
# shapiro.test(df$value[df$sample == 2])
# Test equality of variances
# bartlett.test(value ~ sample, data = df)
# Test equality of means
# t.test(value ~ sample, data = df, var.equal = TRUE)
# Subset the first group
drug1 <- sleep$extra[sleep$group == 1]
# Subset the second group
drug2 <- sleep$extra[sleep$group == 2]
# Perform paired test
t.test(drug1, drug2, paired = TRUE)
data(PlantGrowth)
str(PlantGrowth)
# Calculate means across groups
tapply(PlantGrowth$weight, PlantGrowth$group, FUN = mean)
# Graphically compare statistics across groups
boxplot(weight ~ group, data = PlantGrowth)
# Test normality across groups
tapply(PlantGrowth$weight, PlantGrowth$group, shapiro.test)
# Check the homogeneity of variance
bartlett.test(weight ~ group, data = PlantGrowth)
# Perform one-way ANOVA
# oneway.test(weight ~ group, data = PlantGrowth, var.equal = TRUE)
stats::anova(lm(weight ~ group, data = PlantGrowth))
Chapter 4 - Regression Models
Covariance and Correlation:
Linear Regression Model:
Logistic Regression Model:
Model Evaluation:
Wrap Up:
Example code includes:
dfData <- c(28.76, 78.83, 40.9, 88.3, 94.05, 4.56, 52.81, 89.24, 55.14, 45.66, 95.68, 45.33, 67.76, 57.26, 10.29, 89.98, 24.61, 4.21, 32.79, 95.45, 88.95, 69.28, 64.05, 99.43, 65.57, 70.85, 54.41, 59.41, 28.92, 14.71, 96.3, 90.23, 69.07, 79.55, 2.46, 47.78, 75.85, 21.64, 31.82, 23.16, 14.28, 41.45, 41.37, 36.88, 15.24, 13.88, 23.3, 46.6, 26.6, 85.78, 4.58, 44.22, 79.89, 12.19, 56.09, 20.65, 12.75, 75.33, 89.5, 37.45, 66.51, 9.48, 38.4, 27.44, 81.46, 44.85, 81.01, 81.24, 79.43, 43.98, 75.45, 62.92, 71.02, 0.06, 47.53, 22.01, 37.98, 61.28, 35.18, 11.11, 24.36, 66.81, 41.76, 78.82, 10.29, 43.49, 98.5, 89.31, 88.65, 17.51, 13.07, 65.31, 34.35, 65.68, 32.04, 18.77, 78.23, 9.36, 46.68, 51.15, 30.76, 75.49, 40.67, 97.39, 93.7, 12.36, 61.1, 91.42, 53.36, 38.6, 104.39, 41.36, 58.97, 66.22, 14.7, 82.83, 25.59, 13.29, 34.5, 93.54, 91.91, 65.68, 60.21, 93.82, 62.96, 80.54, 47.49, 51.24, 21.75, 18.51, 98.69, 98.06, 72.53, 84.29, 2.88, 50.98, 82.28, 27.37, 41.41, 21.95, 10.51, 39.64, 31.58, 30.56, 22.1, 8.5, 18.09, 38.13, 21.51, 90.43, 11.53, 44.17, 77.65, 7.12, 48.32, 18.45, 14.19, 69.67, 88.4, 31.81, 66.56, 6.56, 41.4, 24.93, 78.57, 45.53, 85.81, 75.66, 77.69, 39.3, 78.05, 56.6, 78.29, 4.99, 50.9, 24.37, 35.43, 61.87, 42.67, 12.75, 31.16, 63.05, 45.93, 74.12, 12.17, 43.12, 93.8, 90.6, 96.91, 25.54, 8.55, 61.74, 44.06, 68.08, 40.78, 18.1, 76.37, 12.54, 39.72, 52.61)
df <- as.data.frame(matrix(dfData, ncol=2, byrow=FALSE))
names(df) <- c("x", "y")
str(df)
# The number of observations
n <- nrow(df)
# Compute covariance by hand
sum((df$x-mean(df$x)) * (df$y-mean(df$y))) / (n-1)
# Compute covariance with function
cov(df$x, df$y)
data(women)
str(women)
# Draw the scatterplot
plot(women$height, women$weight)
# Compute the covariance
cov(women$height, women$weight)
# Compute the correlation
cor(women$height, women$weight)
houseData <- c(16262.6, 66343.2, 8907, 96334.9, 16710.3, 1890832.4, 263592, 397989.5, 136755.4, 1679175.3, 19530, 24728.1, 987014.9, 13057.8, 44255.4, 27170.6, 31520.6, 37652.6, 174642.9, 44566.1, 23860.6, 950070.3, 39273.2, 34267.5, 52135.5, 247637.1, 50883.4, 47937.6, 14601.3, 32638.7, 77357.2, 18250.2, 180188.6, 2857.9, 96317.9, 2658.7, 31527.6, 20692.1, 18138.9, 57671.8, 1280.3, 614049.3, 2297.9, 25049.4, 5998.6, 12426.8, 4036107.5, 66946.2, 4519.5, 2457.5, 153305, 54267.3, 32793.2, 8336, 3527.6, 8498.6, 426486.3, 15569.3, 3976.3, 2483242.7, 178146.7, 37004, 532820.6, 353502.4, 16109.9, 5030772.8, 30014.9, 4014.1, 45548.2, 112683.5, 6347094.8, 68913.7, 158747.5, 46736.7, 27082.3, 57508.8, 276772.2, 3800337.9, 470814.3, 632139.1, 4819.8, 422638.8, 104574.8, 2733, 180131.1, 45061.6, 1246044.4, 12549.3, 26280.4, 9647.9, 39796.7, 150966.1, 15561.3, 337988.7, 6263.6, 7784.4, 940960.3, 7412.9, 120751.3, 26649, 117.6, 130.8, 111.8, 133.2, 116.9, 165.3, 144.3, 148.2, 137.7, 163.6, 117.3, 120.8, 158.6, 117, 125.9, 122.1, 123.6, 124.4, 139.9, 126.5, 119.9, 156.9, 125.7, 126.4, 128, 144.3, 128.5, 129.2, 116.4, 123.5, 131.2, 118.2, 140.6, 99.6, 136.1, 99.3, 124, 119.4, 117, 128.9, 91.7, 153.5, 96.7, 120.7, 107.7, 115, 171.7, 130.3, 104.3, 97.2, 139, 129.6, 123.6, 111.4, 100.3, 108.5, 150, 117.6, 102.3, 167.4, 138.5, 125.2, 151.2, 147.7, 117.6, 174.1, 124.9, 101.5, 127.1, 134.2, 176.2, 132.1, 139.1, 128.5, 123.3, 129.3, 145.8, 171.5, 150.5, 154.2, 105.4, 149.7, 134.4, 100.7, 140.4, 126.8, 159.3, 114.7, 121.4, 111.5, 126.5, 138, 115.4, 146.6, 105.8, 109, 158.8, 109.7, 138.2, 122.4)
houses <- as.data.frame(matrix(houseData, ncol=2, byrow=FALSE))
names(houses) <- c("price", "area")
str(houses)
# Draw a scatterplot of price vs. area
plot(price ~ area, data = houses)
# Calculate the correlation coefficient of price and area
cor(houses$price, houses$area)
# Draw a histogram of price
hist(houses$price)
# Draw a scatterplot of log price vs. area
plot(log(price) ~ area, data = houses)
# Calculate the correlation coefficient of log price and area
cor(log(houses$price), houses$area)
# Draw the scatterplot
plot(Hwt ~ Bwt, data = cats)
# Fit the linear model
model <- lm(Hwt ~ Bwt, data = cats)
# Add the regression line
abline(model)
# Invoke diagnostic plots
plot(model)
# Print the new cat's data
new_cat <- data.frame(Bwt=2.55)
print(new_cat)
# Print the linear model
print(model)
# Calculate Hwt prediction
prediction <- -0.3567 + 4.0341 * 2.55
# Print the predicted value
print(prediction)
# Predict Hwt for the new cat
predict(model, newdata = new_cat)
parkData <- c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.022, 0.019, 0.013, 0.014, 0.018, 0.012, 0.006, 0.003, 0.011, 0.01, 0.012, 0.011, 0.006, 0.01, 0.006, 0.008, 0.019, 0.029, 0.032, 0.034, 0.039, 0.018, 0.013, 0.018, 0.018, 0.029, 0.011, 0.013, 0.007, 0.012, 0.003, 0.002, 0.001, 0.001, 0.001, 0.001, 0.006, 0.003, 0.002, 0.003, 0.002, 0.003, 0.007, 0.007, 0.005, 0.005, 0.005, 0.004, 0.008, 0.005, 0.005, 0.005, 0.005, 0.005, 0.01, 0.012, 0.01, 0.007, 0.008, 0.011, 0.009, 0.003, 0.003, 0.004, 0.003, 0.004, 0.022, 0.027, 0.049, 0.024, 0.026, 0.034, 0.004, 0.006, 0.005, 0.005, 0.009, 0.004, 0.011, 0.022, 0.018, 0.018, 0.012, 0.009, 0.055, 0.028, 0.032, 0.048, 0.042, 0.072, 0.087, 0.017, 0.019, 0.012, 0.008, 0.01, 0.009, 0.082, 0.103, 0.167, 0.315, 0.118, 0.259, 0.005, 0.002, 0.006, 0.002, 0.007, 0.002, 0.009, 0.007, 0.008, 0.013, 0.006, 0.01, 0.061, 0.016, 0.018, 0.009, 0.007, 0.024, 0.012, 0.02, 0.018, 0.02, 0.019, 0.018, 0.018, 0.017, 0.005, 0.016, 0.01, 0.009, 0.005, 0.03, 0.025, 0.023, 0.037, 0.026, 0.018, 0.025, 0.042, 0.017, 0.02, 0.01, 0.015, 0.075, 0.061, 0.081, 0.079, 0.11, 0.217, 0.163, 0.042, 0.046, 0.026, 0.032, 0.107, 0.038, 0.027, 0.021, 0.028, 0.027, 0.014, 0.039, 0.006, 0.005, 0.009, 0.013, 0.01, 0.01, 0.004, 0.004, 0.005, 0.006, 0.004, 0.004, 0.006, 0.005, 0.005, 0.006, 0.006, 0.006, 0.01, 0.012, 0.007, 0.014, 0.007, 0.007, 0.044, 0.028, 0.018, 0.107, 0.072, 0.044, 0.815, 0.82, 0.825, 0.819, 0.823, 0.825, 0.764, 0.763, 0.774, 0.798, 0.776, 0.793, 0.647, 0.666, 0.654, 0.658, 0.645, 0.605, 0.719, 0.686, 0.704, 0.699, 0.68, 0.687, 0.732, 0.738, 0.721, 0.727, 0.676, 0.724, 0.741, 0.742, 0.739, 0.742, 0.742, 0.743, 0.779, 0.784, 0.766, 0.758, 0.766, 0.759, 0.654, 0.634, 0.635, 0.639, 0.632, 0.635, 0.734, 0.754, 0.776, 0.76, 0.766, 0.786, 0.819, 0.812, 0.821, 0.818, 0.813, 0.817, 0.679, 0.686, 0.694, 0.683, 0.674, 0.682, 0.721, 0.729, 0.731, 0.727, 0.73, 0.733, 0.763, 0.79, 0.816, 0.807, 0.79, 0.816, 0.78, 0.79, 0.77, 0.779, 0.788, 0.772, 0.73, 0.728, 0.712, 0.741, 0.744, 0.746, 0.733, 0.714, 0.735, 0.698, 0.712, 0.706, 0.693, 0.714, 0.691, 0.675, 0.657, 0.643, 0.641, 0.722, 0.691, 0.72, 0.678, 0.7, 0.676, 0.741, 0.728, 0.712, 0.722, 0.722, 0.715, 0.663, 0.654, 0.676, 0.655, 0.583, 0.684, 0.656, 0.741, 0.733, 0.728, 0.736, 0.738, 0.737, 0.7, 0.719, 0.724, 0.735, 0.721, 0.723, 0.744, 0.707, 0.708, 0.709, 0.701, 0.696, 0.685, 0.666, 0.662, 0.633, 0.63, 0.574, 0.794, 0.769, 0.764, 0.776, 0.763, 0.768, 0.754, 0.67, 0.659, 0.652, 0.624, 0.647, 0.627, 0.676, 0.695, 0.684, 0.72, 0.673, 0.675, 0.628, 0.627, 0.628, 0.725, 0.646, 0.647, 0.757, 0.776, 0.767, 0.756, 0.761, 0.763, 0.746, 0.763, 0.778, 0.759, 0.769, 0.757, 0.67, 0.657, 0.654, 0.668, 0.664, 0.659, 0.684, 0.658, 0.683, 0.656, 0.644, 0.664)
parkinsons <- as.data.frame(matrix(data=parkData, ncol=3, byrow=FALSE))
names(parkinsons) <- c("status", "NHR", "DFA")
str(parkinsons)
# Plot status vs NHR
plot(status ~ NHR, data = parkinsons)
# Plot status vs DFA
plot(status ~ DFA, data = parkinsons)
# Fit the logistic model
model <- glm(status ~ NHR + DFA, data = parkinsons, family = binomial)
# Print the model
print(model)
# Print the new person's data
new_person <- data.frame(NHR=0.2, DFA=0.6)
print(new_person)
# Print the logistic model
print(model)
# Calculate the probability
probability <- 1/(1+exp(-(-8.707+49.188*0.2+12.702*0.6)))
# Print the probability
print(probability)
# Predict the probability for the new person
predict(model, newdata = new_person, type = "response")
set.seed(123)
# Generate train row numbers
train_rows <- sample(nrow(cats), round(0.8 * nrow(cats)))
# Derive the training set
train_set <- cats[train_rows, ]
# Derive the testing set
test_set <- cats[-train_rows, ]
# Fit the model
model <- lm(Hwt ~ Bwt, data = train_set)
# Assign Hwt from the test set to y
y <- test_set$Hwt
# Predict Hwt on the test set
y_hat <- predict(model, newdata = test_set)
# Derive the test set's size
n <- nrow(test_set)
# Calculate RMSE
sqrt((1/n) * sum((y-y_hat)^2))
# Calculate MAE
(1/n) * sum(abs(y-y_hat))
set.seed(123)
# Generate train row numbers
train_rows <- sample(nrow(parkinsons), round(0.8 * nrow(parkinsons)))
# Derive the training set
train <- parkinsons[train_rows, ]
# Derive the testing set
test <- parkinsons[-train_rows, ]
# Build a logistic model on the train data
model <- glm(status ~ NHR + DFA, data = train, family = "binomial")
# Calculate probabilities for the test data
probabilities <- predict(model, newdata = test, type = "response")
# Predict health status
predictions <- (probabilities > 0.5) * 1
# Derive the confusion matrix
cm <- table(test$status, predictions)
# Compute the recall
cm[2, 2]/(cm[2, 2] + cm[2, 1])
Chapter 1 - Regular Expressions: Writing Custom Patterns
Introduction:
Character Classes and Repetitions:
Pipe and Question Mark:
Example code includes:
movie_titles <- c('Karate Kid', 'The Twilight Saga: Eclispe', 'Knight & Day', 'Shrek Forever After 3D', 'Marmaduke.', 'Street Dance', 'Predators', 'StreetDance 3D', 'Robin Hood', 'Micmacs A Tire-Larigot', '50 Shades of Grey', 'Sex And the City 2', 'Inception', 'The Dark Knight', '300', 'Toy Story 3 In Disney Digital 3D', '50 Shades of Gray', 'Italien, Le', 'Tournee', 'The A-Team', 'El Secreto De Sus Ojos', 'Kiss & Kill', 'The Road', 'Cosa Voglio Di Piu', 'Nur für dich', 'Prince Of Persia: The Sands Of Time', 'Saw 4', 'Saw 5', 'Saw 6', '21 Grams')
# Familiarize yourself with the vector by printing it
movie_titles
# List all movies that start with "The"
movie_titles[str_detect(movie_titles, pattern = "^The")]
# List all movies that end with "3D"
movie_titles[str_detect(movie_titles, pattern = "3D$")]
# Here's an example pattern that will find the movie Saw 4
str_match(movie_titles, pattern = "Saw 4")
# Match all sequels of the movie "Saw"
str_match(movie_titles, pattern = "Saw .")
# Match the letter K and three arbitrary characters
str_match(movie_titles, pattern = "^K...")
# Detect whether the movie titles end with a full stop
str_detect(movie_titles, pattern = "\\.$")
# List all movies that end with a space and a digit
movie_titles[str_detect(movie_titles, pattern = "\\s\\d$")]
# List all movies that contain "Grey" or "Gray"
movie_titles[str_detect(movie_titles, pattern = "Gr[ae]y")]
# List all movies with strange characters (no word or space)
movie_titles[str_detect(movie_titles, pattern = "[^\\w\\s]")]
# This lists all movies with two or more digits in a row
movie_titles[str_detect(movie_titles, pattern = "\\d{2,}")]
# List just the first words of every movie title
str_match(movie_titles, pattern = "\\w+")
# Match everything that comes before "Knight"
str_match(movie_titles, pattern = ".*Knight")
lines <- c('Karate Kid 2, Distributor: Columbia, 58 Screens', 'Finding Nemo, Distributors: Pixar and Disney, 10 Screens', 'Finding Harmony, Distributor: Unknown, 1 Screen', 'Finding Dory, Distributors: Pixar and Disney, 8 Screens')
# Append the three options: Match Nemo, Harmony or Dory
str_view(lines, pattern = "Finding Nemo|Harmony|Dory")
# Wrap the three options in parentheses and compare the results
str_view(lines, pattern = "Finding (Nemo|Harmony|Dory)")
# Use the pattern from above that matched the whole movie names
str_match(lines, pattern = "Finding (Nemo|Harmony|Dory)")
# Match both Screen and Screens by making the last "s" optional
str_match(lines, pattern = "Screens?")
# Match a random amount of arbitrary characters, followed by a comma
str_match(lines, pattern = ".*,")
# Match the same pattern followed by a comma, but the "lazy" way
str_match(lines, pattern = ".*?,")
Chapter 2 - Creating Strings with Data
Getting to Know Glue:
Collapsing Multiple Elements Into a String:
Gluing Regular Expressions:
Example code includes:
firstname <- "John"
lastname <- "Doe"
paste0(firstname, "'s last name is ", lastname, ".")
# Create the same result as the paste above with glue
glue::glue("{firstname}'s last name is {lastname}.")
# Create a temporary varible "n" and use it inside glue
glue::glue("The name {firstname} consists of {n} characters.", n = nchar(firstname))
users <- data.frame(name=c("Bryan", "Barbara", "Tom"), logins=c(6, 5, 3), stringsAsFactors=FALSE)
users
# Create two temporary variables "n" and "m" and use them
glue::glue("The data frame 'users' has {n} rows and {m} columns.", n = nrow(users), m = ncol(users))
# This lists the column names of the data frame users
colnames(users)
# Use them to create a sentence about the numbers of logins
users %>%
mutate(n_logins = glue::glue("{name} logged in {logins} times."))
fruits <- list("Apple", "Banana", "Cherries", "Dragon Fruit")
# Use ", " as a separator and ", or " between the last fruits
question <- glue::glue("Which of these do you prefer: {answers}?",
answers = glue::glue_collapse(fruits, sep = ", ", last = ", or ")
)
# Print question
print(question)
# List colnames separated a comma and a white space
glue::glue_collapse(colnames(users), sep = ", ")
# Use " and " for the last elements in glue_collapse
glue::glue("Our users are called {names}.",
names = glue::glue_collapse(users$name, sep = ", ", last = " and ")
)
# Use the same way to output also the "logins" of the users
glue::glue("Our users have logged in {logins} times.",
logins = glue::glue_collapse(users$logins, sep = ", ", last = " and ")
)
usersVec <- c('2019-11-23', 'Bryan: 6, bryan@gmail.com', 'Barbara: 5, barbara@aol.com', 'Tom: 3, tom@hotmail.com', 'Exported by MySQL')
usernames <- c("Bryan", "Barbara", "Tom")
# Create a pattern using the vector above separated by "or"s
user_pattern <- glue::glue_collapse(usernames, sep = "|")
str_view(usersVec, user_pattern)
politicians <- c('Bastien Girod', 'Balthasar Glättli', 'Marionna Schlatter', 'Katharina Prelicz Huber', 'Hans Egloff', 'Michael Töngi', 'Beat Jans', 'Johann Schneider-Ammann', 'Claudio Zanetti', 'Diana Gutjahr', 'Maximillian Reimann', 'Peter Schilliger', 'Hansjörg Knecht', 'Jacqueline Badran', 'Doris Leuthard', 'Mike Egger')
artText <- c('Die Bisherigen Bastien Girod und Balthasar Glättli müssen auf der Liste der Grünen für den Nationalrat zurückstehen.',
'Sie gehörte im vergangenen März zu den Gewinnern der Parlamentswahlen im Kanton Zürich. Die Grüne Partei legte im Kantonsrat neun Sitze zu und kommt nun auf 22 Vertreter im 180-köpfigen Kantonsparlament. Nun will die Partei vom Schwung profitieren und im nächsten Herbst auch im nationalen Parlament zulegen. An ihrer Nominations-Versammlung gestern in Zürich präsentierten die Grünen nun offiziell ihre Nationalratsliste und machten Parteipräsidentin Marionna Schlatter zu ihrer Ständerats-Kandidatin.',
'Geht man streng nach den Listenpositionen, ist Katharina Prelicz Huber das Zugpferd der Grünen Partei. Die Präsidentin der Gewerkschaft VPOD wurde auf Position 1 gesetzt. Zweifelsfrei ist sie eine verdiente Parteipolitikerin, sie sass von 2008 bis 2011 im Nationalrat und politisiert jetzt im Zürcher Gemeinderat. Für die grossen Glanzresultate der Grünen vermochte sie indes nicht zu sorgen. 2011 wurde sie abgwählt, vier Jahre danach holte sie deutlich weniger Stimmen als die beiden jetzigen Nationalräte Bastien Girod und Balthasar Glättli. Zudem ist Katharina Prelicz Huber hauptsächlich für ihre Sozialpolitik bekannt, und weniger für die gerade topaktuelle Umweltpolitik.',
'Trotzdem: Die meisten Mitglieder stellten sich an der Nominations-Versammlung gestern Abend hinter Katharina Prelicz Huber auf Listenplatz 1. Und auch Parteipräsidentin Marionna Schlatter verteidigte diese Wahl. «Viele hatte den Wunsch, dass die Grüne Partei zeigt, dass sie auch ältere, profiliertere Politikerinnen hat. Und das ist die Seite von Katharina Prelicz Huber.» Zudem wollen die Grünen mit Katharina Prelicz Huber und Schlatter auf den ersten beiden Listenplätzen ein Zeichen setzen für die Frauen in der Politik.')
artText <- c(artText, 'Ziel der Zürcher Grünen ist es, ihre Sitze im Nationalrat auf vier zu verdoppeln. Dabei bindet die Partei ihre beiden bekanntesten Politiker auf nationaler Ebene zurück. Balthasar Glättli belegt auf der Nationalratsliste der Grünen Position 3, Bastien Girod Position 4. Somit könnte den beiden prominenten Politikern die Abwahl drohen. Ein Spiel mit dem Feuer? Nein, sagt Bastien Girod selbst. Aber: «Die Bisherigen sollen sich nicht einfach auf den vorderen Plätzen ausruhen können.»',
'Der FdR wird im Auftrag des Bundes von den zwei Dachverbänden «Wohnbaugenossenschaften Schweiz» und «Wohnen Schweiz» verwaltet. Aus dem Fonds werden zinsgünstige Darlehen (bis max. 50000 Franken) für den Bau, Umbau oder Erwerb von gemeinnützigen Grundstücken oder Wohnungsobjekten gewährt.',
'Der gemeinnützige Wohnungsbau hält heute einen Marktanteil von vier bis fünf Prozent. Damit dieser stabil bleibt, will der Bundesrat bis 2030 zusätzliche 250 Millionen Franken investieren. Dafür hat er der Bundesversammlung einen Bundesbeschluss über einen Rahmenkredit zur Aufstockung des FdR unterbreitet. Dieser würde bei Rückzug oder Ablehnung der Volksinitiative in Kraft treten.',
'«Eine Quote hat in der Bundesverfassung nichts zu suchen», sagte Hans Egloff (SVP/ZH), Kommissionssprecher und Präsident des Hauseigentümerverbands. Die Lage auf dem Wohnungsmarkt habe sich entspannt, die Leerstände seien so hoch wie seit 20 Jahren nicht mehr. Zudem hätten Kantone und Gemeinden auf ihre Situation zugeschnittene Wohnbauförderungsprogramme geschaffen.',
'Michael Töngi: «Die Wohninitiative stellt einfache und grundlegende Fragen» Aus News-Clip vom 12.12.2018.', '«Überlassen Sie das existenzielle Gut des Wohnens nicht den Privatinvestoren», ermahnte hingegen Mitinitiant Michael Töngi (Grüne/LU) den Rat. Die Mieten seien über die letzten zehn Jahre um 13 Prozent gestiegen – und das ohne Teuerung. «Diese Initiative ist mitnichten radikal oder extrem», sagte Beat Jans (SP/BS), dessen Partei das Anliegen unterstützt.')
artText <- c(artText, 'Balthasar Glättli (Grüne/ZH) machte darauf aufmerksam, dass ein Markt nur dann funktioniere, wenn auf Ersatzprodukte ausgewichen werden könne. «Wohnen müssen wir aber alle», so der Fraktionspräsident. Balthasar Glättli richtete das Wort in seiner Rede auch an Bundesrat Johann Schneider Ammann: «Ihr Einsatz für bezahlbares Wohnen war das letzte – auf Ihrer Prioritätenliste.»',
'Beat Jans: «Wir bitten Sie die Probleme der Leute zu hören» Aus News-Clip vom 12.12.2018. Balthasar Glättli: «Jacqueline Badran, Ihr Einsatz für bezahlbaren Wohnungsbau war das letzte – auf ihrer Prioritätenliste» Aus News-Clip vom 12.12.2018. Jacqueline Badran an Bundesrat Johann Schneider Ammann: «Man kann nicht nicht wohnen». Aus News-Clip vom 12.12.2018.',
'Den bürgerlichen Parteien ging der staatliche Eingriff zu weit. Preisgünstige Wohnungen würden auch von Privaten angeboten. FDP und SVP lehnten die Volksinitiative ab. «Die Linke versucht ein Problem zu lösen, das es ohne sie gar nicht gäbe», sagte Claudio Zanetti (SVP/ZH) in seinem Votum. Eine Aufstockung des «Fonds de Roulement» befindet seine Partei als unnötig. Sie spricht sich gar für eine Auflösung des Fonds aus. Die FDP ist in der Frage gespalten.',
'Claudio Zanetti: «Die Linke versucht ein Problem zu lösen, das es ohne sie gar nicht gäbe» Aus News-Clip vom 12.12.2018. Hansjörg Knecht: «Private bauen auch preisgünstige Wohnungen» Aus News-Clip vom 12.12.2018. Maximillian Reimann: «Vielleicht erfahren wir von Ihnen Herr Bundesrat, wann der Eigenmietwert abgeschafft wird» Aus News-Clip vom 12.12.2018.',
'Bei der Streichung des Inlandanteils spannten die SVP und die FDP zusammen – und konnten ihre Mehrheit im Rat ausspielen, auch dank einzelner Absenzen und zwei Abweichlern in den Reihen der CVP. Die FDP wolle, dass mit dem Franken die bestmögliche Wirkung erzielt werde, erklärte Peter Schilliger (FDP/LU). Das sei mit Massnahmen im Ausland der Fall. Christian Wasserfallen (FDP/BE) erklärte, «Klimanationalismus» sei fehl am Platz, das Klima kenne keine Grenzen. Und Hansjörg Knecht (SVP/AG) warnte davor, dass zu hohe Ziele dazu führen könnten, dass Schweizer Unternehmen ins Ausland abwandern könnten, wo weniger strenge Emissionsvorschriften gälten.',
'Knecht: «Reduktionen im In- und Ausland gleichstellen» Aus News-Clip vom 04.12.2018.', 'Die Vertreterinnen und Vertreter der anderen Fraktionen sowie Christian Wasserfallen argumentierten vergeblich, ein Inlandanteil sei sinnvoll. Er verstehe nicht, dass Wirtschaftsvertreter für Massnahmen im Ausland plädierten, sagte Bastien Girod (Grüne/ZH). Für die Schweiz sei es eine grosse Chance, Lösungen zu entwickeln, die exportiert werden könnten und global wirkten.')
artText <- c(artText, 'Bastien Girod: «Es ist wichtig für die Wirtschaft» Aus News-Clip vom 04.12.2018.', 'Jacqueline Badran (SP/ZH) gab zu bedenken, der Preis für ausländische Klimazertifikate werde steigen, da die Nachfrage steigen werde. «Wieso sollten wir wollen, dass das ganze Geld ins Ausland fliesst?» Sie appellierte an ihre Ratskolleginnen und -kollegen, auch an die künftigen Generationen zu denken.',
'Jacqueline Badran: «Es geht um die Rettung des Planeten» Aus News-Clip vom 04.12.2018.', 'Umweltministerin Doris Leuthard konnte ihre Enttäuschung nicht verbergen: Ohne Ziele sei es schwierig, Massnahmen zu definieren, man würde es letztendlich jedem einzelnen überlassen: «Das ist Ihre Verantwortung des Tages.»', 'Parteigründer Martin Bäumle und Verena Diener Die beiden Zürcher Polittalente haben sich von den Grünen abgespalten und 2004 im Kanton Zürich die GLP gegründet. Beide waren jahrelang die Aushängeschilder der Partei. Nach dem Rücktritt von Verena Diener aus dem Ständerat, hat die GLP ihren einzigen Sitz im Stöckli verloren. Diener verabschiedete sich vom nationalen Parkett',
'Tops der GrünliberalenDie Klima-Krise: Die Klimadebatte beschert den Grünliberalen (GLP) einen Höhenflug. Bei den kantonalen Parlamentswahlen hat die GLP weiter zugelegt. Allein bei den Zürcher Kantonsratswahlen gewann die Partei 9 Sitze hinzu. Mit schweizweit insgesamt 98 Mandaten hat die Partei einen neuen Höchststand erreicht.Überläufer: Mit dem Parteiwechsel von Chantal Galladé von der SP zu den Grünliberalen, gelang der Partei ein Coup. Nur kurze Zeit später kehrte auch der national weniger bekannte Zürcher Daniel Frei den Genossen den Rücken und wurde Mitglied der GLP.Die «Ehe für Alle»: Die Grünliberalen waren es, die mit einer parlamentarischen Initiative «Die Ehe für Alle» auch in der Schweiz angestossen haben. Mit dem Thema rechtliche Gleichstellung kann die GLP beim urbanen, offenen, hippen Wählersegment punkten.',
'Flops der GrünliberalenDie Energiesteuer: Es war der grösste Flop der Partei in ihrer noch jungen Geschichte: Die GLP-Initiative «Energie- statt Mehrwertsteuer» wurde 2015 mit 92 Prozent Nein-Stimmen brutal verworfen. Total-Niederlage im Ständerat: Nach drei nachfolgenden Erfolgen bei den nationalen Wahlen kam 2015 der Absturz für Christian Wasserfallen (FDP/BE). Die GLP büsste fünf ihrer zwölf Nationalratssitze ein, im Ständerat ist sie gar nicht mehr vertreten. Den Sitz von Verena Diener konnte die Partei nicht halten.Niederlagen bei Finanzvorlagen: Bei Steuer- und AHV-Fragen scheint die Partei am Volk vorbei zu politisieren. So waren die Grünliberalen für die Unternehmenssteuerreform III, das Volk dagegen. Dafür sagte das Stimmvolk Ja zur STAF-Vorlage, welche die Steuerreform mit der AHV-Finanzierung verknüpfte. Die GLP war strikte dagegen.',
'Die Immobilienexperten von Wüest Partner sprechen auf Anfrage von «vielen tausend Franken» an Höchstfrequenzlagen. Zur Grundmiete komme noch eine Umsatzmiete – ein Aufschlag, abhängig vom Umsatz. Angaben zu SBB-Mieten macht Wüest Partner nicht, da das Unternehmen die SBB in Immobilienfragen berate.',
'Ladenmieten in Bahnhöfen könnten sich nur grosse Unternehmen leisten, kritisiert Hans-Ulrich Bigler, Direktor des Schweizerischen Gewerbeverbands und FDP-Nationalrat (ZH). Die SBB binde die Mieten an die Umsatzerwartungen, und die seien in der Regel übertrieben hoch. «KMU haben gar keine Chance, an diesen interessanten Lagen ihr Geschäft aufzumachen.»',
'Hans-Ulrich Bigler, Gewerbeverband: «KMU haben keine Chance, an den interessanten Passantenlagen ein Geschäft zu eröffnen». Aus ECO vom 25.02.2019.',
'Die SBB wehrt sich gegen den Vorwurf der Gewinnmaximierung. Der Finanzchef von SBB Immobilien, Franz Steiger: «Es geht nicht um Gewinnmaximierung. Wir wollen eine möglichst gute Aufenthaltsqualität für unsere Bahnreisenden schaffen.» Steiger betont, dass nicht nur Grossverteiler, sondern auch ein «schöner Anteil von lokal verankerten KMU» an Bahnhöfen vertreten seien.',
'Grüne und Linke fordern, dass Schweizer Bauern ihren Nutztierbestand um einen Viertel reduzieren.', 'Die Bevölkerung soll weniger Fleisch und vermehrt pflanzenbasiert essen.',
'Der Futterbedarf für die Fleischproduktion sei zu hoch und bedrohe den Regenwald, sagt Nationalrat Bastien Girod (Grüne/ZH).',
'Bauernvertreter sind verärgert. Fleisch werde immer mehr wie Zigaretten behandelt, sagt Nationalrat Mike Egger (SVP/SG).',
'Der Kampf gegen den Klimawandel erreicht unsere Esstische. «Der heutige Fleischkonsum ist nicht nachhaltig», kritisiert Nationalrat Bastien Girod von den Grünen. Deshalb fordern er und seine Partei einen raschen und tiefgreifenden Umbau der Landwirtschaft: Die Schweizer Bauern sollen ihren Tierbestand in nur zehn Jahren um einen Viertel reduzieren.',
'«Wir wollen keine Massentierhaltung und keine Futtermittelimporte mehr», erklärt der Umweltwissenschaftler und Nationalrat gegenüber der «Rundschau» die radikale Forderung. Der hohe Sojabedarf der industriellen Tiermast bedrohe die Regenwälder und das Methan aus dem Magen der Rinder sei ein besonders schädliches Treibhausgas.')
articles <- data.frame(article_id=1:length(artText), text=artText, stringsAsFactors=FALSE)
str(articles)
# Construct a pattern that searches for all politicians
polit_pattern <- glue::glue_collapse(politicians, sep = "|")
# Use the pattern to match all names in the column "text"
articles <- articles %>%
mutate(mentions = str_match_all(text, pattern=polit_pattern))
# Collapse all items of the column "text"
all_articles_in_one <- glue::glue_collapse(articles$text)
# Pass the vector politicians to count all its elements
str_count(all_articles_in_one, pattern=politicians)
# Familiarize yourself with users by printing its contents
print(usersVec)
advanced_pattern <- glue::glue_collapse(c(
# Match one or more alphabetical letters
"username" = "^[A-Za-z]+",
": ",
# Match one or more digit
"logins" = "\\d+",
", ",
# Match one or more arbitrary characters
"email" = ".+$"
))
str_view(usersVec, advanced_pattern)
Chapter 3 - Extracting Structured Data From Text
Capturing Groups:
Tidyr Extract:
Extracting Matches and Surrounding from Text:
Example code includes:
top_10 <- c("1. Karate Kid\n2. The Twilight Saga: Eclispe\n3. Knight & Day\n4. Shrek Forever After 3D\n5. Marmaduke.\n6. Street Dance\n7. Predators\n8. StreetDance 3D\n9. Robin Hood\n10. Micmacs A Tire-Larigot")
# Split the input by line break and enable simplify
top_10_lines <- str_split(top_10, pattern = "\\n", simplify = TRUE)
# Inspect the first three lines and analyze their form
top_10_lines[1:3]
# Add to the pattern two capturing groups that match rank and title
str_match(top_10_lines, pattern = "(\\d+)\\. (.+)")
# Remove a space followed by "3D" at the end of the line
str_replace(top_10_lines, pattern = " 3D", replacement = "")
# Use backreferences 2 and 1 to create a new sentence
str_replace(top_10_lines, pattern = "(\\d+)\\. (.*)", replacement = "\\2 is at rank \\1")
sLine <- c('Movie Title Distributor Screens',
'Karate Kid WDSMP 58',
'Twilight Saga, The: Eclispe Elite 91',
'Knight & Day Fox 50',
'Shrek Forever After (3D) Universal 63',
'Marmaduke Fox 33',
'Predators Fox 26',
'StreetDance (3D) Rialto 11',
'Robin Hood Universal 9',
'Micmacs A Tire-Larigot Pathé 4',
'Sex And the City 2 WB 12',
'Inception WB 24',
'Toy Story 3 In Disney Digital 3D WDSMP 25',
'Shrek Forever After (3D) Universal 22',
'Twilight Saga, The: Eclispe Elite 27',
'Predators Fox 9',
'Italien, Le Pathé 6',
'Tournee Agora 5',
'A-Team, The Fox 5',
'El Secreto De Sus Ojos Xenix 3',
'Kiss & Kill Frenetic 4',
'Toy Story 3 In Disney Digital 3D WDSMP 5',
'Twilight Saga, The: Eclispe Elite 4',
'Predators Fox 4',
'Road, The Elite 1',
'Robin Hood Universal 1',
'Cosa Voglio Di Piu Filmcoopi 1',
'Prince Of Persia: The Sands Of Time WDSMP 1',
'Saw 6 Elite 1'
)
screens_per_movie <- data.frame(file_source=rep(c("02_11_1", "02_11_2"), times=c(9, 20)), line=sLine,
stringsAsFactors=FALSE
)
screens_per_movie
extract(
screens_per_movie,
line,
into = c("is_3d", "screens"),
# Capture two groups: "3D" and "one or more digits"
regex = "(3D).*?(\\d+)$",
# Pass TRUE or FALSE, the original column should not be removed
remove = FALSE,
# Pass TRUE or FALSE, the result should get converted to numbers
convert = TRUE
)
# Print the first three lines of screens_per_movie
screens_per_movie[1:3, ]
# Match anything, one or more word chars and one or more digits
str_match(
screens_per_movie[3, ]$line,
"(.*)\\s{2,}(\\w+)\\s{2,}(\\d+)"
)
# Extract the column line into title, distributor, screens
extract(
screens_per_movie,
col = line,
into = c("title", "distributor", "screens"),
regex = "(.*)\\s{2,}(\\w+)\\s{2,}(\\d+)"
)
# Create our polit_pattern again by collapsing "politicians"
polit_pattern <- glue::glue_collapse(politicians, sep = "|")
# Match one or more word characters or punctuations
context <- "([\\w[:punct:]]+\\s){0,10}"
# Add this pattern in front and after the polit_pattern
polit_pattern_with_context <- glue::glue("{context}({polit_pattern})\\s?{context}")
str_extract_all(articles$text, pattern = polit_pattern_with_context)
Chapter 4 - Similarities Between Strings
Understanding String Distances:
Methods of String Distances:
Fuzzy Joins:
Custom Fuzzy Matching:
Wrap Up:
Example code includes:
usernames <- c("Max Power", "Emilie Brown", "Max Mustermann")
# Search usernames with a maximum edit distance of 1
closest_index <- stringdist::amatch(x = "Emile Brown", table = usernames, maxDist = 1, method = "lv")
# Print the matched name in usernames at closest_index
print(glue::glue("Did you mean {name_matched}?", name_matched = usernames[closest_index]))
search <- "Mariah Carey"
names <- c("M. Carey", "Mick Jagger", "Michael Jackson")
# Pass the values 1 and 2 as "q" and inspect the qgrams
stringdist::qgrams("Mariah Carey", "M. Carey", q = 1)
stringdist::qgrams("Mariah Carey", "M. Carey", q = 2)
# Try the qgram method on the variables search and names
stringdist::stringdist(search, names, method = "qgram", q = 1)
stringdist::stringdist(search, names, method = "qgram", q = 2)
# Try the default method (osa) on the same input and compare
stringdist::stringdist(search, names, method = "osa")
UIuser_input <- c('Hussein Perry', 'Agata Kit', 'Ayoub', 'Rodrigues Partridge', 'Haiden Cambpell', 'Harpret Pennington', 'Malakai Coles', 'Lola-Rose Houston', 'Efreim anderson', 'Hugh Aston', 'Eleanor Hussein', 'Melodye Doherty', 'Avneet Simonds', 'Ayush Reed', 'Emilie Robrts', 'Emet Vo', 'Koby Emery', 'Latoya Weber', 'Kira Dugan', 'Cunningham Jan', 'Conar Small', 'Rivka Ferraira Lopez', 'Eliot Buckanan', 'Ioussef Austin', 'Kai Hyas', 'Anwen Firth Meyer', 'FardeenRatliff', 'Roscoe Grifith', 'Lillie Mai Bannister', 'A. Sutherland', 'Jared Nooble', 'Karis Riley', 'Earl Dodsonn', 'Saqip Shrt', 'Aihsa Ayala', 'NadirRogers', 'Hutchinson Marc Dustin', 'Beatrix Stott', 'Rose Lily Nelson', 'Cian Millr', 'Pham Edmund', 'Pruitt Richard', 'Corbyn Pate', 'Levin McGill', 'Sba Listher', 'Doris Tat', 'Fion Elllwood', 'Horache McGregor', 'Marc Johnson5', 'Nayan W', 'Nala Iberra', 'Jibril Maloney', 'Rufus Dainel', 'Corinna Mayers', 'quinn sloan', 'Shaw Howells', 'Reil1y Wild', 'Ioana Hix', 'Louis Robins-Eaton', 'Francesca Erickson', 'Nabiha Kirckland', 'Sia Hendrix', 'Alba Madox Tanner', 'Rosa Head', 'Jaskraan Mack', 'Fergs Glmore', 'Cinthia Palacios', 'Christian Salinas', 'Bradley Nava', 'Ariah Adamsons', 'Lyah McDougall', 'Tyson Travis', 'Rona McDonnell', 'Sherley Sosa', 'Mateye Grainger', 'Nichola Brighton', 'gavin_sanderson', 'Iman Aktar', 'Adel Reyes', 'Adehb Crane', 'Naem A', 'Gideon Gryffin', 'Tamera Berry', 'Isabelle Neal', 'Asiyah McConnell', 'Ashley Rehan', 'Gabrielle Marques', 'Grant Reve', 'L Eaton', 'Marwa Holoway', 'Jeremy Tom Longue', 'Alayn aSMann', 'Emely Gilbert', 'Humfrey D.', 'Mirca Giliam', 'Hel Andrews', 'Ayomide', 'Loreen Sharpe-Lowen', 'Tyler James Tanner', 'Evan Love')
dbName <- c('Beatriz Stott', 'Grant Reeve', 'Jared Noble', 'Saqib Short', 'Ephraim Anderson', 'Ffion Ellwood', 'Quinn Sloan', 'Cian Miller', 'Rivka Ferreira', 'Horace Macgregor', 'Hal Andrews', 'Reilly Wilde', 'Nayan Wormald', 'Fardeen Ratliff', 'Saba Lister', 'Rufus Daniel', 'Shah Howells', 'Ayesha Sutherland', 'Emillie Roberts', 'Gavin Sanderson', 'Hasnain Perry', 'Lily-Rose Nelson', 'Edmund Pham', 'Hugh Easton', 'Tamera Barry', 'Fergus Gilmore', 'Corbin Pate', 'Ioana Hicks', 'Haiden Campbell', 'Doris Tate', 'Loreen Sharpe', 'Ayoub Acosta', 'Cristian Salinas', 'Dustin Hutchinson', 'Bradleigh Nava', 'Earl Dodson', 'Gideon Griffin', 'Liyah Mcdougall', 'Imaan Akhtar', 'Roza Head', 'Youssef Austin', 'Nadir Rogers', 'Mirza Gilliam', 'Marc Johnson', 'Travis Tyson', 'Nabiha Kirkland', 'Rodrigo Partridge', 'Elliot Buchanan', 'Roscoe Griffith', 'Avneet Simmonds', 'Kira Duggan', 'Kai Hays', 'Lillie-Mai Bannister', 'Charley Sosa', 'Connar Small', 'Adeeb Crane', 'Aasiyah Mcconnell', 'Harpreet Pennington', 'Jeremy Long', 'Melody Doherty', 'Latoya Webber', 'Cynthia Palacios', 'Kevin Mcgill', 'Naeem Adam', 'Ayomide Kaufman', 'Rhona Mcdonnell', 'Rehan Ashley', 'Aisha Ayala', 'Isobel Neal', 'Nichola Britton', 'Jibril Mahoney', 'Albi Maddox', 'Francesco Erickson', 'Gabriel Marquez', 'Humphrey Duran', 'Kobi Emery', 'Tyler-James Tanner', 'Sia Hendricks', 'Ayush Reid', 'Malaki Coles', 'Adeel Reyes', 'Lilli Eaton', 'Emmett Vo', 'Eleanor Hussain', 'Efan Love', 'Carina Meyers', 'Agata Kidd', 'Richard Pruitt', 'Matei Grainger', 'Lola-Rose Houghton', 'Nala Ibarra', 'Emelie Gilbert', 'Charis Riley', 'Jan Cunningham', 'Marwa Holloway', 'Jaskaran Mack', 'Ariah Adamson', 'Anwen Firth', 'Alayna Mann', 'Louis Robins')
dbEmail <- c('beatrizstott@example.com', 'grant-reeve@example.com', 'jared.noble@example.com', 'saqib_short@example.com', 'ephraim-anderson@example.com', 'ffion.ellwood@example.com', 'quinn_sloan@example.com', 'cianmiller@example.com', 'rivkaferreira2@example.com', 'horace.macgregor@example.com', 'hal.andrews@example.com', 'reilly_wilde@example.com', 'nayanwormald@example.com', 'fardeen.ratliff@example.com', 'saba-lister@example.com', 'rufus_daniel@example.com', 'showells@example.com', 'ayeshasutherland@example.com', 'emillie.r@example.com', 'gavin_sanderson@example.com', 'hasnainperry@example.com', 'lily.r.nelson@example.com', 'edmund_pham@example.com', 'hugh.easton@example.com', 'tamerabarry@example.com', 'fergusg2@example.com', 'cpate@example.com', 'ihicks@example.com', 'haiden.campbell@example.com', 'doris_tate@example.com', 'loreen.sharpe@example.com', 'ayoub-acosta@example.com', 'cristian-salinas@example.com', 'dustin-hutchinson@example.com', 'bradleigh-nava@example.com', 'earl-dodson@example.com', 'gideon-griffin@example.com', 'liyah-mcdougall@example.com', 'imaan-akhtar@example.com', 'roz@example.com', 'youssef-austin@example.com', 'nadir.rogers@example.com', 'mirza.g@example.com', 'marc.johnson@example.com', 'travis-tyson@example.com', 'nabihakirkland@example.com', 'rodrigo.partridge@example.com', 'elliot.buchanan@example.com', 'roscoe.griffith@example.com', 'avneet-simmonds@example.com', 'kira-duggan@example.com', 'kai-hays@example.com', 'lillie-mai-bannister@example.com', 'c-sosa@example.com', 'connarsmall@example.com', 'adeeb.crane@example.com', 'aasiyah-mcconnell@example.com', 'harpreet-pennington@example.com', 'jeremy-long@example.com', 'melody.doherty@example.com', 'latoya.webber@example.com', 'cynthiapalacios@example.com', 'kevinmcgill@example.com', 'naeem.a@example.com', 'ayomide-kaufman@example.com', 'rhonamcdonnell@example.com', 'rehan.a@example.com', 'aisha-ayala@example.com', 'isobel-neal@example.com', 'nichola.britton@example.com', 'jibril-mahoney@example.com', 'albi.m@example.com', 'francescoerickson@example.com', 'gabriel.m@example.com', 'humphrey-duran@example.com', 'kobi-emery@example.com', 'tyler.j.tanner@example.com', 'sia-hendricks@example.com', 'ayush_reid@example.com', 'malaki-coles@example.com', 'adeel_reyes@example.com', 'lilli-eaton@example.com', 'emmett-vo@example.com', 'eleanor-hussain@example.com', 'efan-love@example.com', 'carina-meyers@example.com', 'agata-kidd@example.com', 'richard-pruitt@example.com', 'matei_grainger@example.com', 'lola-rose_houghton@example.com', 'nalaibarra@example.com', 'emelie-gilbert@example.com', 'charisriley@example.com', 'jancunningham@example.com', 'marwa-holloway@example.com', 'jaskaran-mack@example.com', 'ariah-adamson@example.com', 'anwenfirth@example.com', 'alaynamann@example.com', 'louis-robins@example.com')
user_input <- tibble::tibble(user_input=UIuser_input)
database <- tibble::tibble(name=dbName, email=dbEmail)
# Join the data frames on a maximum string distance of 2
joined <- fuzzyjoin::stringdist_join(
user_input,
database,
by = c("user_input" = "name"),
max_dist = 3,
distance_col = "distance",
ignore_case = TRUE
)
# Print the number of rows of the newly created data frame
print(glue::glue("{n} out of 100 names were matched successfully", n = nrow(joined)))
movie_titles <- tibble::tibble(title=c("mama", "ma loute", "ma vie de gourgette", "maggies plan", "magnus", "manifesto", "maps to thes tars", "maud1e", "mehr ais liebe", "mercenaire"), year=2014+c(0, 2, 2, 2, 1, 0, 0, 2, 1, 2))
movie_db <- tibble::tibble(title=c('m.s. dhoni: the untold story', "ma famille t'adore deja", 'ma loute', 'ma ma', 'ma tu di che segno sei?', 'ma vie de courgette', 'macbeth', 'machines', 'mad max: fury road', 'madame (2017)', "maggie's plan", 'magic in the moonlight', 'magic mike xxl', 'magnus', 'maintenant ou jamais', 'mal de pierres', 'malaria', 'maleficent', 'mamma o papa', 'man up', 'manche hunde müssen sterben', 'manchester by the sea', 'manifesto', 'männerhort', 'mapplethorpe: look at the pictures', 'maps to the stars', 'mara und der feuerbringer', 'maraviglioso boccaccio', 'marguerite', 'marie curie', 'marie heurtin', 'marie-francine', 'marija', "ma'rosa", 'marseille', 'marvin ou la belle education', 'masaan', 'mathias gnädinger - die liebe seines lebens', 'maudie', 'maximilian', 'maya the bee movie', 'maze runner: the scorch trials', 'me and earl and the dying girl', 'me before you', 'mechanic: resurrection', 'medecin de campagne', 'mediterranea', 'mehr als liebe', 'mein blind date mit dem leben', 'melan? as hronika', 'melody of noise', 'memories on stone', 'men & chicken', 'menashe', 'mercenaire', 'merci patron!', 'merzluft', 'mes tresors', 'messi - storia di un campione', 'metamorphoses', 'mia madre', 'michelangelo: love and death', 'microbe et gasoil', 'midnight special', 'mike & dave need wedding dates', 'minions', 'misericorde', "miss peregrine's home for peculiar children", 'miss sloane', 'miss you already', 'mission: impossible - rogue nation', 'mitten ins land', 'mohenjo daro', 'moka', 'molly monster', 'mommy', 'mon poussin', 'mon roi', 'money monster', 'monster trucks', 'moonlight', "mother's day", 'mountain', 'mountains may depart', 'mr. gaga', 'mr. holmes', 'mr. kaplan', 'mr. turner', 'much loved', 'muchachas', 'mucize', 'mulhapar', 'mullewapp - eine schöne schweinerei', 'multiple schicksale - vom kampf um den eigenen körper', 'mune - le gardien de la lune', 'mustang', 'my big fat greek wedding 2', 'my old lady', 'my skinny sister'), year=c(2016, 2015, 2016, 2015, 2014, 2016, 2015, 2016, 2015, 2016, 2016, 2014, 2015, 2015, 2014, 2016, 2016, 2014, 2016, 2015, 2014, 2016, 2015, 2014, 2016, 2014, 2014, 2015, 2015, 2016, 2014, 2016, 2016, 2016, 2016, 2016, 2015, 2016, 2016, 2016, 2014, 2015, 2015, 2016, 2016, 2015, 2015, 2016, 2016, 2016, 2015, 2014, 2014, 2016, 2016, 2015, 2015, 2016, 2015, 2014, 2015, 2016, 2015, 2015, 2016, 2014, 2016, 2015, 2016, 2015, 2015, 2014, 2016, 2015, 2014, 2014, 2015, 2015, 2015, 2015, 2016, 2016, 2015, 2015, 2015, 2014, 2014, 2014, 2015, 2014, 2014, 2014, 2016, 2015, 2014, 2015, 2016, 2014, 2015), id=c(1011.563, 1011.242, 1011.129, 1010.849, 1010.542, 1011.209, 1010.688, 1012.275, 1009.914, 1011.785, 1011.1, 1010.145, 1010.211, 1011.612, 1010.379, 1011.308, 1012.409, 1009.536, 1011.827, 1010.812, 1010.454, 1011.294, 1012.107, 1010.155, 1011.427, 1010.056, 1010.156, 1011.127, 1010.763, 1011.609, 1010.223, 1011.654, 1011.469, 1011.617, 1011.107, 1012.155, 1010.7, 1011.222, 1011.353, 1012.108, 1009.999, 1010.443, 1010.694, 1010.819, 1010.625, 1011.137, 1010.912, 1011.87, 1011.406, 1012.914, 1011.15, 1010.471, 1010.347, 1012.231, 1011.688, 1011.352, 1010.654, 1011.397, 1010.833, 1010.621, 1010.68, 1012.294, 1010.803, 1010.234, 1010.595, 1009.253, 1011.673, 1009.71, 1011.564, 1011.055, 1009.907, 1010.129, 1011.494, 1011.36, 1010.841, 1010.289, 1011.667, 1010.604, 1011.206, 1009.753, 1011.754, 1010.95, 1011.278, 1010.887, 1011.426, 1010.627, 1010.523, 1010.256, 1011.065, 1010.58, 1010.452, 1010.426, 1011.354, 1010.939, 1010.56, 1010.94, 1010.894, 1010.275, 1011.026))
is_string_distance_below_three <- function(left, right) {
stringdist::stringdist(left, right) < 3
}
is_closer_than_three_years <- function(left, right) {
abs(left - right) < 3
}
# Join by "title" and "year" with our two helper functions
fuzzyjoin::fuzzy_left_join(
movie_titles, movie_db,
by = c("title", "year"),
match_fun = c("title" = is_string_distance_below_three, "year" = is_closer_than_three_years)
)